# Clean Up our Intake Data

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from util import (
    np_age, np_concat, np_age_concat, get_lat, get_lng, 
    update_profession)

In [2]:
# Utility for displaying our DataFrames
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each positional argument must provide ``to_html()`` (e.g. a pandas
    DataFrame). The generated tables are given an inline display style so the
    browser lays them out horizontally, and the combined markup is handed to
    IPython's ``display_html`` as raw HTML. Nothing is returned.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Only patch the opening tag: a plain 'table' match would also rewrite
    # '</table>' closing tags and any cell text that contains the word.
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

### First let us read in our unclean export data

In [3]:
# Load the raw export. The file uses the literal string '---' as a placeholder
# value (visible in the preview below); those placeholders are blanked column
# by column in the cleaning cell further down.
#
# Two frame-wide `unclean.replace(...)` calls used to follow this line, but
# their return values were never assigned, so they had no effect on `unclean`
# and have been dropped.
unclean = pd.read_csv('../match-data/manual-match/export.csv')

# Preview the first rows only instead of dumping the whole frame.
unclean.head(10)

Unnamed: 0,number,formid,form.id.city,form.id.interview_date,form.int.has_job,form.int.has_other_income,form.int.months_unemployed,form.cd.first_name,form.cd.last_name,form.cd.mobile_num,...,form.we.jname,form.jp.oppgen,form.jp.oppgen1,form.rw.rwage,form.ed.highest_edu_level,form.int.nation,form.id.gps,form.jp.will_work_qiz,form.jp.will_train_unpaid,form.jp.days_willing_train_unpaid
0,0,uuid:232d8d82-0c11-4aa2-8a76-b4a244f60ca6,3,---,---,---,---,---,---,---,...,---,1,1,220,---,2,32.56341362 35.82743754 527.0 13.65,---,---,---
1,1,uuid:032b493f-e90e-42d8-ba55-66da02b70ec4,3,---,---,---,---,---,---,---,...,شركة حسني جربوع للمقاولات,1,1,350,---,2,32.56381511 35.82414651 538.0 106.18,---,---,---
2,2,uuid:04b3cb22-7f12-4d7b-ad89-25f65b0dcdbe,3,---,---,---,---,---,---,---,...,---,1,1,300,---,2,32.57277106 35.80145862 587.0 25.78,---,---,---
3,3,uuid:326c85f5-ed1d-40fa-a370-17b43c2b0ea9,3,---,---,---,---,---,---,---,...,---,0,0,200,---,2,32.57245561 35.80178963 512.0 13.65,---,---,---
4,4,uuid:bdc35371-ba9d-4d1d-a678-3a1744c32297,3,---,---,---,---,---,---,---,...,---,1,0,300,---,2,32.53877962 35.84859773 637.0 16.68,---,---,---
5,5,uuid:74f432df-c4b6-4af3-94c2-48cdb0a68fa4,3,---,---,---,---,---,---,---,...,---,1,1,300,---,2,32.55391673 35.80441486 569.0 25.78,---,---,---
6,6,uuid:644eb4c7-b08b-4511-ab94-f3a949fb3e2a,3,---,---,---,---,---,---,---,...,---,,0,300,---,2,,---,---,---
7,7,uuid:129638e4-bce9-41b7-a933-779a595fc428,3,---,---,---,---,---,---,---,...,---,1,1,250,---,2,32.5053713 35.8638544 0.0 20.0,---,---,---
8,8,uuid:c185457a-a199-4784-93db-d501d7282229,3,---,---,---,---,---,---,---,...,---,1,0,250,---,2,32.53869682 35.84859372 627.0 4.55,---,---,---
9,9,uuid:d4f974d1-a152-413f-ba0f-d90817fd1429,3,---,---,---,---,---,---,---,...,الانترسوس,1,1,350,---,2,32.5065657 35.8884198 0.0 20.0,---,---,---


In [4]:
def map_cities(col):
    """Return a copy of ``col`` with the city codes spelled out.

    The string codes '1'..'4' become 'mafraq', 'amman', 'irbid' and 'zarqa'
    respectively; every other value is passed through unchanged. ``col`` is
    expected to offer a pandas-style ``replace`` and is not modified.
    """
    city_by_code = (
        ('1', 'mafraq'),
        ('2', 'amman'),
        ('3', 'irbid'),
        ('4', 'zarqa'),
    )
    for code, city in city_by_code:
        col = col.replace([code], city)
    return col

### Now we will merge, reformat columns and create a new "clean" data frame

In [5]:
clean = pd.DataFrame()

# Nationality: spell out the '1'/'2' codes in form.int.nation, blank the '---'
# placeholder in both nationality columns, then join the two columns.
nation = unclean['form.int.nation']
for code, label in (('1', 'jordanian'), ('2', 'syrian'), ('---', '')):
    nation = nation.replace([code], label)
unclean['form.int.nation'] = nation

unclean['form.int.nationality'] = unclean['form.int.nationality'].replace(
    ['---'], '')
# The helper column keeps its existing (misspelled) key 'nationalty'.
unclean['nationalty'] = (
    unclean['form.int.nationality'].map(str) + unclean['form.int.nation'])
# An empty combined string is labelled 'unknown'.
clean['nationality'] = unclean['nationalty'].replace([''], 'unknown')

# Gender: form.dem.male is a '0'/'1' flag, form.dem.gender is kept as entered;
# both get their '---' placeholder blanked before being joined.
male_flag = unclean['form.dem.male']
for code, label in (('0', 'female'), ('1', 'male'), ('---', '')):
    male_flag = male_flag.replace([code], label)
unclean['form.dem.male'] = male_flag

unclean['form.dem.gender'] = unclean['form.dem.gender'].replace(['---'], '')
clean['gender'] = (
    unclean['form.dem.gender'].map(str) + unclean['form.dem.male'])

# Clean Education Level
# form.ed.edu1 stores the level as a numeric code; spell the codes out and
# blank the '---' placeholder.
# The original cell mapped code '7' twice ('masters', then 'doctorate'), so
# 'doctorate' could never be produced. The codes are consecutive, so
# 'doctorate' is mapped from '8' here.
# NOTE(review): confirm against the form's codebook that '8' is doctorate.
edu_labels = (
    ('1', 'none'),
    ('2', 'primary'),
    ('3', 'secondary'),
    ('4', 'college'),
    ('5', 'diploma'),
    ('6', 'bachelors'),
    ('7', 'masters'),
    ('8', 'doctorate'),
    ('---', ''),
)
edu_codes = unclean['form.ed.edu1']
for code, label in edu_labels:
    edu_codes = edu_codes.replace([code], label)
unclean['form.ed.edu1'] = edu_codes

unclean['form.ed.highest_edu_level'] = unclean[
    'form.ed.highest_edu_level'].replace(['---'], '')

# Join the decoded column with the free-standing one; an empty result is
# labelled 'unknown'.
unclean['edu'] = (
    unclean['form.ed.edu1'].map(str) + unclean['form.ed.highest_edu_level'])
clean['education'] = unclean['edu'].replace([''], 'unknown')

# Age: blank the '---' placeholder in the two age columns, derive an age from
# the date of birth with np_age, then combine the candidates with
# np_age_concat (both helpers come from util; their exact rules are not
# visible in this notebook).
for age_col in ('form.dem.age', 'form.dem.age_calc'):
    unclean[age_col] = unclean[age_col].replace(['---'], '')
unclean['dob_age'] = np_age(unclean['form.dem.dob'])

unclean['age_concat'] = np_age_concat(
    unclean['form.dem.age'], unclean['form.dem.age_calc'])
clean['age'] = np_age_concat(unclean['age_concat'], unclean['dob_age'])

# Night shift
for shift_col in ('form.jp.night', 'form.jp.will_work_night_shift'):
    unclean[shift_col] = unclean[shift_col].replace(['---'], '')
clean['night-shift'] = np_concat(
    unclean['form.jp.night'], unclean['form.jp.will_work_night_shift'])

# First and last name
for name_col in ('form.cd.first_name', 'form.cd.fname',
                 'form.cd.last_name', 'form.cd.lname'):
    unclean[name_col] = unclean[name_col].replace(['---'], '')

clean['first-name'] = np_concat(
    unclean['form.cd.first_name'], unclean['form.cd.fname'])
clean['last-name'] = np_concat(
    unclean['form.cd.last_name'], unclean['form.cd.lname'])

# Job, other income and mobile number: each value lives in two source columns.
# Blank the '---' placeholder in both, then merge them with np_concat (from
# util) into a single clean column.
for target, first, second in (
        ('has_job', 'form.int.has_job', 'form.int.job'),
        ('has_other_income', 'form.int.has_other_income', 'form.int.entrep'),
        ('mobile_number', 'form.cd.mobile_num', 'form.cd.mob'),
):
    unclean[first] = unclean[first].replace(['---'], '')
    unclean[second] = unclean[second].replace(['---'], '')
    clean[target] = np_concat(unclean[first], unclean[second])

# Marital status: form.dem.marr is coded, form.dem.marital_status is not.
unclean['form.dem.marital_status'] = unclean[
    'form.dem.marital_status'].replace(['---'], '')
marital_codes = unclean['form.dem.marr']
for code, label in (
        ('---', ''),
        ('1', 'married'),
        ('2', 'single'),
        ('3', 'divoriced'),
        ('4', 'widowed'),
):
    marital_codes = marital_codes.replace([code], label)
unclean['form.dem.marr'] = marital_codes
clean['marital_status'] = np_concat(
    unclean['form.dem.marital_status'], unclean['form.dem.marr'])

# Number of children
for child_col in ('form.dem.num_children', 'form.dem.child'):
    unclean[child_col] = unclean[child_col].replace(['---'], '')
clean['num_children'] = np_concat(
    unclean['form.dem.num_children'], unclean['form.dem.child'])

# Training courses, dorm willingness and weekly/daily availability all follow
# the same recipe: blank the '---' placeholder in the two source columns, then
# merge them with np_concat (from util) into one clean column.
paired_sources = (
    ('num_training_courses',
     'form.tn.num_training_courses', 'form.tn.tn3'),
    ('will_live_in_dorm',
     'form.jp.will_live_in_dorm', 'form.jp.dorm'),
    ('weekly_days_willing_to_work',
     'form.jp.weekly_days_willing_to_work', 'form.jp.days'),
    ('daily_hours_willing_to_work',
     'form.jp.daily_hours_willing_to_work', 'form.jp.hours'),
)
for target, first, second in paired_sources:
    unclean[first] = unclean[first].replace(['---'], '')
    unclean[second] = unclean[second].replace(['---'], '')
    clean[target] = np_concat(unclean[first], unclean[second])

# Preferred workspace: form.cj.cjsect holds a numeric code ('1'..'14') that is
# spelled out below; form.cj.workspace_preference_calc is used as-is. Both get
# their '---' placeholder blanked first.
for workspace_col in ('form.cj.workspace_preference_calc', 'form.cj.cjsect'):
    unclean[workspace_col] = unclean[workspace_col].replace(['---'], '')

# Position in this tuple (starting at 1) is the code used in the export.
workspace_labels = (
    'office',
    'home',
    'souk',
    'farm',
    'factory',
    'mine',
    'construction_site',
    'garage',
    'transport',
    'religious_building',
    'hospital',
    'school',
    'hotel',
    'restaraunt',
)
sector = unclean['form.cj.cjsect']
for number, label in enumerate(workspace_labels, start=1):
    sector = sector.replace([str(number)], label)
unclean['form.cj.cjsect'] = sector

clean['preferred_workspace'] = np_concat(
    unclean['form.cj.workspace_preference_calc'], unclean['form.cj.cjsect'])

# Years of experience
for exp_col in ('form.we.years_exp', 'form.we.yexp'):
    unclean[exp_col] = unclean[exp_col].replace(['---'], '')
clean['years_experience'] = np_concat(
    unclean['form.we.years_exp'], unclean['form.we.yexp'])

# Comments: copied straight across once the '---' placeholder is blanked.
for comment_col in ('form.ex.comments', 'form.com2'):
    unclean[comment_col] = unclean[comment_col].replace(['---'], '')
clean['comments'] = unclean['form.ex.comments']
clean['eso_comments'] = unclean['form.com2']

# Computer experience: form.ed.comp is coded '0'..'4'.
computer_codes = unclean['form.ed.comp']
for code, label in (
        ('---', ''),
        ('0', 'none'),
        ('1', 'basic'),
        ('2', 'word_processing'),
        ('3', 'specialized'),
        ('4', 'programmer'),
):
    computer_codes = computer_codes.replace([code], label)
unclean['form.ed.comp'] = computer_codes

unclean['form.ed.computer_exprience'] = unclean[
    'form.ed.computer_exprience'].replace(['---'], '')
clean['computer_experience'] = np_concat(
    unclean['form.ed.comp'], unclean['form.ed.computer_exprience'])

# English proficiency: form.ed.eng is coded '0'..'4'.
english_codes = unclean['form.ed.eng']
for code, label in (
        ('---', ''),
        ('0', 'not_literate'),
        ('1', 'few_phrases'),
        ('2', 'simple_phrases'),
        ('3', 'extended_conversation'),
        ('4', 'literate'),
):
    english_codes = english_codes.replace([code], label)
unclean['form.ed.eng'] = english_codes

unclean['form.ed.english_proficiency'] = unclean[
    'form.ed.english_proficiency'].replace(['---'], '')
clean['english_proficiency'] = np_concat(
    unclean['form.ed.eng'], unclean['form.ed.english_proficiency'])

# Opposite-gender manager / coworkers: the short columns (oppgen, oppgen1) are
# '0'/'1' flags that are spelled out; the long-named columns are used as-is.
# All four get their '---' placeholder blanked before merging with np_concat.
for target, described_col, flag_col in (
        ('opposite_gender_manager',
         'form.jp.opposite_gender_manager', 'form.jp.oppgen'),
        ('opposite_gender_coworkers',
         'form.jp.opposite_gender_coworkers', 'form.jp.oppgen1'),
):
    unclean[described_col] = unclean[described_col].replace(['---'], '')
    flags = unclean[flag_col]
    for code, label in (('---', ''), ('0', 'will_refuse'), ('1', 'no_issue')):
        flags = flags.replace([code], label)
    unclean[flag_col] = flags
    clean[target] = np_concat(unclean[described_col], unclean[flag_col])

# Expected wage
for wage_col in ('form.rw.expected_wage', 'form.rw.rwage'):
    unclean[wage_col] = unclean[wage_col].replace(['---'], '')
clean['expected_wage'] = np_concat(
    unclean['form.rw.expected_wage'], unclean['form.rw.rwage'])


# Job sector preferences (first and second choice).
# For each choice: blank the '---' placeholder in both source columns, pass
# the short column (jp2 / jp3) through update_profession (from util), then
# merge the pair with np_concat.
for target, preference_col, coded_col in (
        ('first_job_sector_preference',
         'form.jp.first_preference', 'form.jp.jp2'),
        ('second_job_sector_preference',
         'form.jp.second_preference', 'form.jp.jp3'),
):
    unclean[coded_col] = unclean[coded_col].replace(['---'], '')
    unclean[preference_col] = unclean[preference_col].replace(['---'], '')
    unclean[coded_col] = update_profession(unclean[coded_col])

    clean[target] = np_concat(unclean[preference_col], unclean[coded_col])

# Education major: spell out the numeric codes '1'..'15' in form.ed.major.
# Position in this tuple (starting at 1) is the code used in the export.
# Note: the decoded column stays in `unclean`; this cell does not copy it
# into `clean`.
major_labels = (
    'engineering',
    'business_econ',
    'natural_science',
    'social_science',
    'medical',
    'agriculture',
    'education',
    'automotive',
    'electrical',
    'construction',
    'it',
    'manufacturing',
    'architecture',
    'carpentry',
    'textile',
)
major = unclean['form.ed.major']
for number, label in enumerate(major_labels, start=1):
    major = major.replace([str(number)], label)
unclean['form.ed.major'] = major

# Best job field: here update_profession runs *before* the '---' placeholder
# is blanked (the opposite order from the job-preference columns), so the
# order is kept exactly as it was.
unclean['form.we.jdesc'] = update_profession(unclean['form.we.jdesc'])
for field_col in ('form.we.jdesc', 'form.we.job_description'):
    unclean[field_col] = unclean[field_col].replace(['---'], '')
clean['best_job_field'] = np_concat(
    unclean['form.we.jdesc'], unclean['form.we.job_description'])

# Remaining paired columns: blank '---' in both, then merge with np_concat.
# Each row is (clean column, first column to blank, second column to blank,
# first np_concat argument, second np_concat argument).
for target, blank_a, blank_b, left, right in (
        ('best_job_name',
         'form.we.jname', 'form.we.job_name',
         'form.we.jname', 'form.we.job_name'),
        ('secondary_contact_mobile',
         'form.cd.mob1', 'form.cd.sec_contact_mobile',
         'form.cd.mob1', 'form.cd.sec_contact_mobile'),
        ('eso_id',
         'form.id.eso_id', 'form.id.eid',
         'form.id.eid', 'form.id.eso_id'),
        ('interview_date',
         'form.id.date', 'form.id.interview_date',
         'form.id.interview_date', 'form.id.date'),
):
    unclean[blank_a] = unclean[blank_a].replace(['---'], '')
    unclean[blank_b] = unclean[blank_b].replace(['---'], '')
    clean[target] = np_concat(unclean[left], unclean[right])

# Columns that are copied over unchanged (their '---' placeholders are kept).
for target, source in (
        ('national_id', 'form.cd.national_id'),
        ('moi_id', 'form.cd.moi'),
        ('unhcr_id', 'form.cd.unhcr_id'),
        ('will_work_qiz', 'form.jp.will_work_qiz'),
        ('will_train_unpaid', 'form.jp.will_train_unpaid'),
        ('days_willing_to_train_unpaid', 'form.jp.days_willing_train_unpaid'),
):
    clean[target] = unclean[source]

# City: the '---' placeholder is intentionally left alone here (the replace
# for it was disabled); only the numeric codes are spelled out by map_cities.
city_codes = unclean['form.id.city']
clean['city'] = map_cities(city_codes)

clean['city']

0      irbid
1      irbid
2      irbid
3      irbid
4      irbid
5      irbid
6      irbid
7      irbid
8      irbid
9      irbid
10     irbid
11     irbid
12     irbid
13     irbid
14     irbid
15     irbid
16     irbid
17     irbid
18     irbid
19     irbid
20     irbid
21     irbid
22     irbid
23     irbid
24     irbid
25     irbid
26     irbid
27     irbid
28     irbid
29     irbid
       ...  
556    irbid
557    irbid
558    irbid
559    irbid
560    irbid
561    irbid
562    irbid
563    irbid
564    irbid
565    irbid
566    amman
567    irbid
568    irbid
569    irbid
570    irbid
571    irbid
572    irbid
573    irbid
574    irbid
575    irbid
576    amman
577    amman
578    amman
579    zarqa
580    amman
581    amman
582    irbid
583    irbid
584    irbid
585    irbid
Name: city, Length: 586, dtype: object

## Translate all English keywords to Arabic

In [6]:
# Lookup tables: both files have an 'ID' column (the English keyword / column
# name) and an 'Arabic Translation' column.
translation = pd.read_csv('../match-data/translations/manual-match-translations-finished.csv')
translated_columns = pd.read_csv('../match-data/translations/manual-match-column-headers-translated.csv')

# Translate cell values: every cell in `clean` that is exactly equal to a
# keyword is swapped for its Arabic translation, one keyword at a time in
# file order.
for keyword, arabic in zip(translation['ID'],
                           translation['Arabic Translation']):
    clean = clean.replace([keyword], arabic)


def translate_column(column):
    """Return the Arabic header for ``column``.

    Looks ``column`` up in the 'ID' column of ``translated_columns`` and
    returns the first matching 'Arabic Translation'. A header that has no
    entry in the table is returned unchanged instead of raising IndexError,
    so one missing translation no longer aborts the whole rename.
    """
    matches = translated_columns.loc[
        translated_columns['ID'] == column, 'Arabic Translation']
    if matches.empty:
        return column
    return matches.values[0]


# Rename via reassignment rather than inplace=True.
clean = clean.rename(columns=translate_column)

# Preview the first rows only instead of dumping the whole frame.
clean.head(10)

nationality
0    الجنسية
Name: Arabic Translation, dtype: object
gender
1    الجنس
Name: Arabic Translation, dtype: object
education
2    المستوى التعليمي
Name: Arabic Translation, dtype: object
age
3    العمر
Name: Arabic Translation, dtype: object
night-shift
4    قابلية العمل نوبات ليلية
Name: Arabic Translation, dtype: object
first-name
5    الأسم الأول 
Name: Arabic Translation, dtype: object
last-name
6    الأسم الأخير
Name: Arabic Translation, dtype: object
has_job
7    وجود وظيفة حالياً
Name: Arabic Translation, dtype: object
has_other_income
8    وجود دخل آخر حالياً
Name: Arabic Translation, dtype: object
mobile_number
9    رقم الهاتف الجوال
Name: Arabic Translation, dtype: object
marital_status
10    الحالة الاجتماعية
Name: Arabic Translation, dtype: object
num_children
11    عدد الاطفال
Name: Arabic Translation, dtype: object
num_training_courses
12    عدد الدورات التدريبية المتخذة
Name: Arabic Translation, dtype: object
will_live_in_dorm
13    قابلية العيش في سكن موظفين
Nam

Unnamed: 0,الجنسية,الجنس,المستوى التعليمي,العمر,قابلية العمل نوبات ليلية,الأسم الأول,الأسم الأخير,وجود وظيفة حالياً,وجود دخل آخر حالياً,رقم الهاتف الجوال,...,رقم الهاتف المحمول الثانوي,رقم مسؤول خدمات التوظيف,موعد المقابلة,الرقم الوطني,رقم وزارة الداخلية,رقم المفوضية,إذا عرضت عليك وظيفة في شركة تقع في منطقة صناعية مؤهلة مثل منطقة الحسن الصناعية ، هل ستكون على استعداد لقبول هذه الوظيفة؟,إذا عرضت عليك وظيفة طلبت منك حضور تدريب غير مدفوع قبل بدء العمل، فهل ستكون على استعداد لقبول هذه الوظيفة؟,كم من الوقت (بالأيام) ستكون على استعداد على القيام ابلتدريب الغير المدفوع؟,المدينة
0,لاجئ سوري,أنثى,ثانوي,48.0,0,امل,حامد قناطرة,0,0,795216977,...,790526977,1,2018-04-19,---,1.008447e+09,---,---,---,---,اربد
1,لاجئ سوري,ذكر,ثانوي,0.0,1,محمد,قناطره,0,0,795538618,...,798811732,1,2018-04-19,---,1.004443e+09,---,---,---,---,اربد
2,لاجئ سوري,ذكر,ابتدائي,44.0,1,محمد,ابو شناق,0,0,790332061,...,779660713,4,2018-04-23,---,1.002912e+09,---,---,---,---,اربد
3,لاجئ سوري,أنثى,ثانوي,28.0,0,رنا,اللبني,0,0,775998904,...,796596354,1,2018-04-23,---,8.001417e+09,---,---,---,---,اربد
4,لاجئ سوري,أنثى,دبلوم (غير فني),40.0,0,عفاف,العواد,0,0,795457213,...,795954919,4,2018-04-24,---,8.001188e+09,---,---,---,---,اربد
5,لاجئ سوري,أنثى,ابتدائي,0.0,0,مريم,الجهماني,0,0,796684778,...,790236758,1,2018-04-24,---,8.003168e+09,---,---,---,---,اربد
6,لاجئ سوري,أنثى,لا شيء,53.0,0,عزيزة حسين,ابو حصيني,0,0,790695877,...,799589229,9,2018-04-24,---,8.003165e+09,---,---,---,---,اربد
7,لاجئ سوري,أنثى,ثانوي,20.0,0,ولاء,العاسمي,0,0,798972432,...,795761798,2,2018-04-24,---,8.001261e+09,---,---,---,---,اربد
8,لاجئ سوري,أنثى,ثانوي,19.0,0,ايات,طحينه,0,0,781826033,...,795457213,4,2018-04-24,---,8.001188e+09,---,---,---,---,اربد
9,لاجئ سوري,أنثى,بكالوريوس,25.0,0,هيام,القداح,0,0,798385382,...,797934883,2,2018-04-25,---,1.005174e+09,---,---,---,---,اربد


### Lastly, let's save our cleaned data out as a CSV

In [7]:
# Write the cleaned, translated frame to disk using pandas' default to_csv
# settings (so the row index is written as the first column).
clean.to_csv('../match-data/manual-match/manual_clean.csv')