# Clean Up our Intake Data

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
# Utility for displaying our DataFrames
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each argument is converted with ``to_html()`` and the opening ``<table``
    tag is given ``style="display:inline"`` so the tables flow horizontally
    instead of stacking vertically.

    Only the opening tag is rewritten. Replacing every occurrence of the word
    "table" (as a plain ``str.replace('table', ...)`` would) also turns the
    closing tags into ``</table style=...>`` and alters any header or cell
    text that happens to contain "table".

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames).
    """
    inline_tables = ''.join(
        df.to_html().replace('<table', '<table style="display:inline"')
        for df in args
    )
    display_html(inline_tables, raw=True)

In [3]:
from datetime import date, datetime

def calculate_age(born, today=None):
    """Return the age in completed years for a ``YYYY-MM-DD`` date of birth.

    Args:
        born: date of birth as text in ``%Y-%m-%d`` format. ``None``, a float
            NaN, ``''``, ``'---'`` and ``'nan'`` are all treated as
            "no answer".
        today: optional reference date (any object with ``year``, ``month``
            and ``day``). Defaults to ``date.today()``; pass a fixed date to
            get a result that does not change from one run to the next.

    Returns:
        The age as an ``int``, or ``''`` when ``born`` is missing.

    Raises:
        ValueError: if ``born`` is present but not in ``%Y-%m-%d`` format.
    """
    # A float NaN is the only value that differs from itself; it must be
    # caught here because strptime() would otherwise raise TypeError.
    if born is None or (isinstance(born, float) and born != born):
        return ''
    if born in ('---', '', 'nan'):
        return ''
    if today is None:
        today = date.today()
    born = datetime.strptime(born, '%Y-%m-%d')
    # The tuple comparison is True (== 1) when this year's birthday has not
    # happened yet, which removes the not-yet-completed year.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending


# The function returns either an int or '' so the output dtype is pinned to
# object; without `otypes` np.vectorize infers the dtype from the first row
# only and fails when a later row has the other type.
np_age = np.vectorize(calculate_age, otypes=[object])

In [4]:
def concat(col1, col2, debug=False):
    """Return the first of two alternative answers that is actually filled in.

    Args:
        col1: preferred value.
        col2: fallback value, used only when ``col1`` is missing.
        debug: unused; kept so existing callers that pass it keep working.

    Returns:
        ``col1`` if it is not missing, otherwise ``col2`` if it is not
        missing, otherwise ``''``. A value counts as missing when it is
        ``''``, ``'---'``, ``'nan'``, ``None`` or a float NaN.
    """
    prohibited = ('', '---', 'nan')

    def _is_missing(value):
        # A real float NaN never compares equal to the text 'nan', so it has
        # to be detected explicitly (NaN is the only value != itself).
        if value is None or (isinstance(value, float) and value != value):
            return True
        return value in prohibited

    for candidate in (col1, col2):
        if not _is_missing(candidate):
            return candidate
    return ''


np_concat = np.vectorize(concat)

In [5]:
def age_concat(col1, col2):
    """Pick the first usable age out of two candidate values.

    Both inputs are converted to ``float``; ``''``, ``'---'``, ``'nan'``,
    ``None`` and NaN all count as "no age" and become ``0.0``.

    Args:
        col1: preferred age (number or numeric text).
        col2: fallback age, used when ``col1`` is missing or zero.

    Returns:
        ``col1`` as a float when it is non-zero, otherwise ``col2`` as a
        float (``0.0`` when both are missing). The result is always a float
        so that the vectorized wrapper produces a single consistent dtype.

    Raises:
        ValueError: if either value is present but not numeric.
    """
    prohibited = ('', '---', 'nan')

    def _to_age(value):
        if value is None or value in prohibited:
            return 0.0
        number = float(value)
        return 0.0 if np.isnan(number) else number

    # Both values are converted up front so a malformed fallback is still
    # reported even when the preferred value is usable.
    first, second = _to_age(col1), _to_age(col2)
    return first if first != 0 else second


# Pin the output dtype: without `otypes` np.vectorize infers it from the
# first row, and a missing first row (integer 0) would truncate every
# later fractional age to an integer.
np_age_concat = np.vectorize(age_concat, otypes=[float])

In [6]:
def lat(col):
    """Extract the first space-separated field of ``col`` as a float.

    Args:
        col: text whose first space-separated token is numeric
            (presumably a "lat lng ..." GPS string -- confirm against the
            export format).

    Returns:
        The first token as a float, or ``0.0`` when ``col`` is not a string
        or the token cannot be parsed.
    """
    try:
        return float(col.split(' ')[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Non-string input (e.g. NaN) or a non-numeric token: fall back to a
        # float zero so every result has the same type.
        return 0.0


# Pin the dtype: without `otypes` an unparsable first row (integer 0) would
# make np.vectorize truncate every later coordinate to an integer.
get_lat = np.vectorize(lat, otypes=[float])

def lng(col):
    """Extract the second space-separated field of ``col`` as a float.

    Args:
        col: text whose second space-separated token is numeric
            (presumably a "lat lng ..." GPS string -- confirm against the
            export format).

    Returns:
        The second token as a float, or ``0.0`` when ``col`` is not a
        string, has fewer than two tokens, or the token cannot be parsed.
    """
    try:
        return float(col.split(' ')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Non-string input (e.g. NaN), a missing second token or a
        # non-numeric token: fall back to a float zero so every result has
        # the same type.
        return 0.0


# Pin the dtype: without `otypes` an unparsable first row (integer 0) would
# make np.vectorize truncate every later coordinate to an integer.
get_lng = np.vectorize(lng, otypes=[float])

### First, let us read in the raw (uncleaned) export data

In [7]:
# Load the raw export as-is.
unclean = pd.read_csv('../match-data/manual-match/export.csv')
# Preview only: the frame returned here (with the '---' placeholder blanked)
# is displayed but not stored, so `unclean` still contains '---'.
unclean.replace(to_replace=['---'], value='')

Unnamed: 0,number,formid,form.id.city,form.id.interview_date,form.int.has_job,form.int.has_other_income,form.int.months_unemployed,form.cd.first_name,form.cd.last_name,form.cd.mobile_num,...,form.jp.opposite_gender_coworkers,form.rw.expected_wage,form.ed.comp,form.ed.eng,form.we.jname,form.jp.oppgen,form.jp.oppgen1,form.rw.rwage,form.ed.highest_edu_level,form.int.nation
0,0,uuid:232d8d82-0c11-4aa2-8a76-b4a244f60ca6,3,,,,,,,,...,,,0,0,,1,1,220,,2
1,1,uuid:032b493f-e90e-42d8-ba55-66da02b70ec4,3,,,,,,,,...,,,0,0,شركة حسني جربوع للمقاولات,1,1,350,,2
2,2,uuid:04b3cb22-7f12-4d7b-ad89-25f65b0dcdbe,3,,,,,,,,...,,,0,0,,1,1,300,,2
3,3,uuid:326c85f5-ed1d-40fa-a370-17b43c2b0ea9,3,,,,,,,,...,,,0,0,,0,0,200,,2
4,4,uuid:bdc35371-ba9d-4d1d-a678-3a1744c32297,3,,,,,,,,...,,,0,1,,1,0,300,,2
5,5,uuid:74f432df-c4b6-4af3-94c2-48cdb0a68fa4,3,,,,,,,,...,,,0,0,,1,1,300,,2
6,6,uuid:644eb4c7-b08b-4511-ab94-f3a949fb3e2a,3,,,,,,,,...,,,0,0,,,0,300,,2
7,7,uuid:129638e4-bce9-41b7-a933-779a595fc428,3,,,,,,,,...,,,1,0,,1,1,250,,2
8,8,uuid:c185457a-a199-4784-93db-d501d7282229,3,,,,,,,,...,,,2,1,,1,0,250,,2
9,9,uuid:d4f974d1-a152-413f-ba0f-d90817fd1429,3,,,,,,,,...,,,1,1,الانترسوس,1,1,350,,2


### Now we will merge, reformat columns and create a new "clean" data frame

In [8]:
clean = pd.DataFrame()

# Each question appears to exist under two column names in the export
# (presumably two versions of the intake form -- confirm). For every question
# we blank the '---' placeholder in both columns, translate numeric codes into
# labels where needed, and keep the first filled-in answer per row.
# All intermediate results are written back to `unclean`, as before.


def _blank_placeholder(column):
    """Replace the '---' placeholder with '' in ``unclean[column]`` (written back)."""
    unclean[column] = unclean[column].replace(['---'], '')


def _recode(column, labels):
    """Replace each code of ``labels`` (code -> label) in ``unclean[column]`` (written back)."""
    recoded = unclean[column]
    for code, label in labels.items():
        # Whole-value replacement: '1' never matches inside '10', '11', ...
        recoded = recoded.replace([code], label)
    unclean[column] = recoded


def _merge(target, first_choice, fallback):
    """Blank both source columns, then store the first filled-in answer per row in ``clean[target]``."""
    _blank_placeholder(first_choice)
    _blank_placeholder(fallback)
    clean[target] = np_concat(unclean[first_choice], unclean[fallback])


# --- Nationality ---------------------------------------------------------
_recode('form.int.nation', {'1': 'jordanian', '2': 'syrian'})
_blank_placeholder('form.int.nation')
_blank_placeholder('form.int.nationality')
# NOTE(review): the two columns are glued together as text. That only works
# when at most one of them is filled in per row, and a NaN in the first column
# would become the literal text 'nan' via map(str).
# The helper column name 'nationalty' (sic) is kept unchanged.
unclean['nationalty'] = (unclean['form.int.nationality'].map(str)
                         + unclean['form.int.nation'])
clean['nationality'] = unclean['nationalty'].replace([''], 'unknown')

# --- Gender --------------------------------------------------------------
_recode('form.dem.male', {'0': 'female', '1': 'male'})
_blank_placeholder('form.dem.male')
_blank_placeholder('form.dem.gender')
clean['gender'] = (unclean['form.dem.gender'].map(str)
                   + unclean['form.dem.male'])

# --- Education level -----------------------------------------------------
_recode('form.ed.edu1', {
    '1': 'none',
    '2': 'primary',
    '3': 'secondary',
    '4': 'college',
    '5': 'diploma',
    '6': 'bachelors',
    '7': 'masters',
    # Fixed: code '7' used to be mapped a second time, so 'doctorate' could
    # never be produced. '8' is the next code in sequence -- TODO confirm
    # against the survey codebook.
    '8': 'doctorate',
})
_blank_placeholder('form.ed.edu1')
_blank_placeholder('form.ed.highest_edu_level')
unclean['edu'] = (unclean['form.ed.edu1'].map(str)
                  + unclean['form.ed.highest_edu_level'])
clean['education'] = unclean['edu'].replace([''], 'unknown')

# --- Age -----------------------------------------------------------------
# Preference order: reported age, then calculated age, then the age derived
# from the date of birth.
_blank_placeholder('form.dem.age')
_blank_placeholder('form.dem.age_calc')
unclean['dob_age'] = np_age(unclean['form.dem.dob'])
unclean['age_concat'] = np_age_concat(unclean['form.dem.age'],
                                      unclean['form.dem.age_calc'])
clean['age'] = np_age_concat(unclean['age_concat'], unclean['dob_age'])

# --- Night shift ---------------------------------------------------------
_merge('night-shift', 'form.jp.night', 'form.jp.will_work_night_shift')

# GPS coordinates were left disabled in the original notebook.
#clean['lat'] = get_lat(unclean['form.id.gps'])
#clean['lng'] = get_lng(unclean['form.id.gps'])

# --- Names ---------------------------------------------------------------
# The placeholders are blanked in `unclean`, but names are not copied into
# `clean` (the assignments below were disabled in the original notebook).
for name_column in ('form.cd.first_name', 'form.cd.fname',
                    'form.cd.last_name', 'form.cd.lname'):
    _blank_placeholder(name_column)

# clean['first-name'] = np_concat(unclean['form.cd.first_name'],
#                                 unclean['form.cd.fname'])
# clean['last-name'] = np_concat(unclean['form.cd.last_name'],
#                                unclean['form.cd.lname'])

# --- Job and other income ------------------------------------------------
_merge('has_job', 'form.int.has_job', 'form.int.job')
_merge('has_other_income', 'form.int.has_other_income', 'form.int.entrep')

# --- Mobile number (blanked in `unclean`, not exported to `clean`) --------
_blank_placeholder('form.cd.mobile_num')
_blank_placeholder('form.cd.mob')
# clean['mobile_number'] = np_concat(unclean['form.cd.mobile_num'], unclean['form.cd.mob'])

# --- Marital status ------------------------------------------------------
# The label 'divoriced' (sic) is kept byte-for-byte: it ends up in the output
# file, so changing it could break consumers of that file.
_recode('form.dem.marr', {
    '1': 'married',
    '2': 'single',
    '3': 'divoriced',
    '4': 'widowed',
})
_merge('marital_status', 'form.dem.marital_status', 'form.dem.marr')

# --- Household, training and availability --------------------------------
_merge('num_children', 'form.dem.num_children', 'form.dem.child')
_merge('num_training_courses', 'form.tn.num_training_courses', 'form.tn.tn3')
_merge('will_live_in_dorm', 'form.jp.will_live_in_dorm', 'form.jp.dorm')
_merge('weekly_days_willing_to_work',
       'form.jp.weekly_days_willing_to_work', 'form.jp.days')
_merge('daily_hours_willing_to_work',
       'form.jp.daily_hours_willing_to_work', 'form.jp.hours')

# --- Workspace preference ------------------------------------------------
# The label 'restaraunt' (sic) is kept byte-for-byte for the same reason as
# 'divoriced' above.
_recode('form.cj.cjsect', {
    '1': 'office',
    '2': 'home',
    '3': 'souk',
    '4': 'farm',
    '5': 'factory',
    '6': 'mine',
    '7': 'construction_site',
    '8': 'garage',
    '9': 'transport',
    '10': 'religious_building',
    '11': 'hospital',
    '12': 'school',
    '13': 'hotel',
    '14': 'restaraunt',
})
_merge('workspace_preference',
       'form.cj.workspace_preference_calc', 'form.cj.cjsect')

# --- Work experience -----------------------------------------------------
_merge('years_experience', 'form.we.years_exp', 'form.we.yexp')

# --- Free-text comments (copied through, not merged) ---------------------
_blank_placeholder('form.ex.comments')
_blank_placeholder('form.com2')
clean['comments1'] = unclean['form.ex.comments']
clean['comments2'] = unclean['form.com2']

# --- Computer experience -------------------------------------------------
# Here the coded column is the preferred source and the long-named column
# ('computer_exprience', spelled as in the export) is the fallback.
_recode('form.ed.comp', {
    '0': 'none',
    '1': 'basic',
    '2': 'word_processing',
    '3': 'specialized',
    '4': 'programmer',
})
_merge('computer_experience', 'form.ed.comp', 'form.ed.computer_exprience')

# --- English proficiency (coded column preferred) ------------------------
_recode('form.ed.eng', {
    '0': 'not_literate',
    '1': 'few_phrases',
    '2': 'simple_phrases',
    '3': 'extended_conversation',
    '4': 'literate',
})
_merge('english_proficiency', 'form.ed.eng', 'form.ed.english_proficiency')

# --- Working with the opposite gender ------------------------------------
opposite_gender_labels = {'0': 'will_refuse', '1': 'no_issue'}
_recode('form.jp.oppgen', opposite_gender_labels)
_merge('opposite_gender_manager',
       'form.jp.opposite_gender_manager', 'form.jp.oppgen')
_recode('form.jp.oppgen1', opposite_gender_labels)
_merge('opposite_gender_coworkers',
       'form.jp.opposite_gender_coworkers', 'form.jp.oppgen1')

# --- Expected wage -------------------------------------------------------
_merge('expected_wage', 'form.rw.expected_wage', 'form.rw.rwage')

clean

Unnamed: 0,nationality,gender,education,age,night-shift,has_job,has_other_income,marital_status,num_children,num_training_courses,...,daily_hours_willing_to_work,workspace_preference,years_experience,comments1,comments2,computer_experience,english_proficiency,opposite_gender_manager,opposite_gender_coworkers,expected_wage
0,syrian,female,secondary,48.0,0,0,0,married,0,,...,8,,,,,none,not_literate,no_issue,no_issue,220
1,syrian,male,secondary,0.0,1,0,0,married,0,,...,8,,1,,,none,not_literate,no_issue,no_issue,350
2,syrian,male,primary,44.0,1,0,0,married,7,,...,8,,,,,none,not_literate,no_issue,no_issue,300
3,syrian,female,secondary,28.0,0,0,0,married,3,,...,6,,,,,none,not_literate,will_refuse,will_refuse,200
4,syrian,female,diploma,40.0,0,0,0,married,6,3,...,5,,,,,none,few_phrases,no_issue,will_refuse,300
5,syrian,female,primary,0.0,0,0,0,married,3,,...,8,,,,,none,not_literate,no_issue,no_issue,300
6,syrian,female,none,53.0,0,0,0,married,,,...,4,,,,,none,not_literate,,will_refuse,300
7,syrian,female,secondary,20.0,0,0,0,single,0,1,...,8,,,لايوجد,,basic,not_literate,no_issue,no_issue,250
8,syrian,female,secondary,19.0,0,0,0,single,0,1,...,5,,,,,word_processing,few_phrases,no_issue,will_refuse,250
9,syrian,female,bachelors,25.0,0,0,0,single,0,4,...,6,,3,,,basic,few_phrases,no_issue,no_issue,350


### Lastly, let's save our cleaned data out as a CSV

In [9]:
# Write the cleaned frame next to the raw data. The DataFrame index is written
# too (pandas default), so the file gets an unnamed leading column.
clean.to_csv(path_or_buf='../match-data/manual_clean.csv')