In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load
import matplotlib.pyplot as plt
import re
import numpy as np

In [3]:
df = pd.read_stata('./NAF Clean Set.dta', )[['Experience_NAF','employment_status','Gender','Gov_NAF','Educ_NAF','Disability_NAF','Age_NAF']]
df.columns = ['experience','employment', 'gender', 'governorate', 'education', 'disability', 'age']
df = df.replace(r'^\s*$', np.nan, regex=True)
df.head()

Unnamed: 0,experience,employment,gender,governorate,education,disability,age
0,0.0,Unemployed,Male,Al Kirk,Secondary or Below,No Disability,41.754963
1,3257.0,Formal worker,Male,Tafileh,Secondary or Below,No Disability,36.320328
2,0.0,Unemployed,Female,Al Kirk,Secondary or Below,No Disability,55.915127
3,0.0,Unemployed,Male,Al Kirk,Secondary or Below,No Disability,26.57358
4,0.0,Daily worker,Male,Zarqa,Secondary or Below,No Disability,47.082821


In [4]:
# unify dealing with strings data
for col in df.select_dtypes(include=['object']):
    df[col] = df[col].str.replace(' ', '_')
    df[col] = df[col].str.replace('-', '_').str.lower()
df.head()

Unnamed: 0,experience,employment,gender,governorate,education,disability,age
0,0.0,unemployed,male,al_kirk,secondary_or_below,no_disability,41.754963
1,3257.0,formal_worker,male,tafileh,secondary_or_below,no_disability,36.320328
2,0.0,unemployed,female,al_kirk,secondary_or_below,no_disability,55.915127
3,0.0,unemployed,male,al_kirk,secondary_or_below,no_disability,26.57358
4,0.0,daily_worker,male,zarqa,secondary_or_below,no_disability,47.082821


In [8]:
for col in df.select_dtypes(include=['object']):
    print(df[col].unique())

['unemployed' 'formal_worker' 'daily_worker' 'informal_worker' 'housewife'
 'self_employed']
['male' 'female']
['al_kirk' 'tafileh' 'zarqa' 'ajloun' 'al_mafraq' 'irbid' 'balqa' 'amman'
 'maadaba' 'jarash' 'maan' 'al_aqaba']
['secondary_or_below' 'middle_diploma' 'bachelor_or_above']
['no_disability' 'with_disability']


In [6]:
df.disability = df.disability.fillna('no_disability')
df.education = df.education.fillna('secondary_or_below')
df.employment = df.employment.fillna('unemployed')

In [7]:
df.education[df.education == 'bachelor'] = 'bachelor_or_above'
df.education[df.education == 'masters'] = 'bachelor_or_above'
df.education[df.education == 'phd'] = 'bachelor_or_above'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'bachelor'] = 'bachelor_or_above'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'masters'] = 'bachelor_or_above'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'phd'] = 'bachelor_or_above'


In [9]:
def experience_code(value):
    if value > 15:
        return 20
    elif value > 10:
        return 15
    elif value > 5:
        return 10
    elif value > 1:
        return 5
    elif value > 0:
        return 1
    else:
        return 0

df.experience = np.round(df.experience / 365)
df.experience = df.experience.apply(experience_code)
df.experience.unique()

array([ 0, 10,  1,  5, 15, 20], dtype=int64)

In [10]:
df.age = (np.round(df.age/10)*10).astype(int)

In [11]:
df.head()

Unnamed: 0,experience,employment,gender,governorate,education,disability,age
0,0,unemployed,male,al_kirk,secondary_or_below,no_disability,40
1,10,formal_worker,male,tafileh,secondary_or_below,no_disability,40
2,0,unemployed,female,al_kirk,secondary_or_below,no_disability,60
3,0,unemployed,male,al_kirk,secondary_or_below,no_disability,30
4,0,daily_worker,male,zarqa,secondary_or_below,no_disability,50


In [12]:
df.drop('employment', axis=1).to_csv('stacked_results.csv', index=False)

In [13]:
df = pd.read_stata('./Unemployment Data.dta', )[['experience', 'Governorate', 'Name_tr', 'Disabled_tr', 'EducationalAttainment', 'JobSeekers_DateOfBirth', 'job_search_start']]
df.head()

Unnamed: 0,experience,Governorate,Name_tr,Disabled_tr,EducationalAttainment,JobSeekers_DateOfBirth,job_search_start
0,0.0,Irbid,Male,No Disability,Secondary or Below,1999-12-10,NaT
1,0.0,Maan,Male,No Disability,Secondary or Below,1999-12-30,2019-02-25
2,0.0,Irbid,Male,No Disability,Secondary or Below,1999-12-25,2019-03-17
3,276.0,Irbid,Male,No Disability,Secondary or Below,1999-12-25,2020-02-01
4,307.0,Irbid,Male,No Disability,Secondary or Below,1999-12-25,2020-04-01


In [14]:
df.columns = ['experience', 'governorate', 'gender', 'disability', 'education', 'birth_date', 'job_search_start']
df = df.replace(r'^\s*$', np.nan, regex=True)
df.tail()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
308518,183.0,Al Kirk,Female,No Disability,Secondary or Below,1999-12-16,2020-04-01
308519,213.0,Al Kirk,Female,No Disability,Secondary or Below,1999-12-16,2020-05-01
308520,244.0,Al Kirk,Female,No Disability,Secondary or Below,1999-12-16,2020-06-01
308521,274.0,Al Kirk,Female,No Disability,Secondary or Below,1999-12-16,2020-07-01
308522,,Irbid,Female,No Disability,Secondary or Below,1999-12-03,2018-04-16


In [15]:
# unify dealing with strings data
for col in df.select_dtypes(include=['object']):
    df[col] = df[col].str.replace(' ', '_')
    df[col] = df[col].str.replace('-', '_').str.lower()
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
0,0.0,irbid,male,no_disability,secondary_or_below,1999-12-10,NaT
1,0.0,maan,male,no_disability,secondary_or_below,1999-12-30,2019-02-25
2,0.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2019-03-17
3,276.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-02-01
4,307.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-04-01


In [21]:
for col in df.select_dtypes(include=['object']):
    print(df[col].unique())

['irbid' 'maan' 'al_kirk' 'zarqa' 'ajloun' 'amman' 'maadaba' 'al_aqaba'
 'tafileh' 'balqa' 'al_mafraq' 'jarash']
['male' 'female']
['no_disability' 'with_disability']
['secondary_or_below' 'vocational_training' 'middle_diploma'
 'bachelor_or_above']


In [20]:
# df.disability = df.disability.fillna('no_disability')
df.education = df.education.fillna('secondary_or_below')
# df.employment = df.employment.fillna('unemployed')
df = df[df.governorate != 'outside_jordan']

In [18]:
df.education[df.education == 'bachelor'] = 'bachelor_or_above'
df.education[df.education == 'masters'] = 'bachelor_or_above'
df.education[df.education == 'phd'] = 'bachelor_or_above'
df.education[df.education == 'high_diploma'] = 'bachelor_or_above'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'bachelor'] = 'bachelor_or_above'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'masters'] = 'bachelor_or_above'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.education[df.education == 'phd'] = 'bachelor_or_above'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [22]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
0,0.0,irbid,male,no_disability,secondary_or_below,1999-12-10,NaT
1,0.0,maan,male,no_disability,secondary_or_below,1999-12-30,2019-02-25
2,0.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2019-03-17
3,276.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-02-01
4,307.0,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-04-01


In [23]:
def experience_code(value):
    if value > 15:
        return 20
    elif value > 10:
        return 15
    elif value > 5:
        return 10
    elif value > 1:
        return 5
    elif value > 0:
        return 1
    else:
        return 0

df.experience = np.round(df.experience / 365)
df.experience = df.experience.apply(experience_code)
df.experience.unique()

array([ 0,  1,  5, 10, 15, 20], dtype=int64)

In [24]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
0,0,irbid,male,no_disability,secondary_or_below,1999-12-10,NaT
1,0,maan,male,no_disability,secondary_or_below,1999-12-30,2019-02-25
2,0,irbid,male,no_disability,secondary_or_below,1999-12-25,2019-03-17
3,1,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-02-01
4,1,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-04-01


In [20]:
# dataS$Age = factor(10 * round((dataS$job_search_start - dataS$JobSeekers_DateOfBirth)/3650, 0)) # Measured in decades.

In [25]:
df.birth_date = pd.to_datetime(df.birth_date)
df.job_search_start = pd.to_datetime(df.job_search_start)
df = df.dropna(subset=['job_search_start','birth_date'])

In [26]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
1,0,maan,male,no_disability,secondary_or_below,1999-12-30,2019-02-25
2,0,irbid,male,no_disability,secondary_or_below,1999-12-25,2019-03-17
3,1,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-02-01
4,1,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-04-01
5,1,irbid,male,no_disability,secondary_or_below,1999-12-25,2020-07-01


In [27]:
import datetime as dt
df.birth_date = (df.birth_date - dt.datetime(1970,1,1)).dt.total_seconds().astype(int)
df.job_search_start = (df.job_search_start - dt.datetime(1970,1,1)).dt.total_seconds().astype(int)

In [28]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start
1,0,maan,male,no_disability,secondary_or_below,946512000,1551052800
2,0,irbid,male,no_disability,secondary_or_below,946080000,1552780800
3,1,irbid,male,no_disability,secondary_or_below,946080000,1580515200
4,1,irbid,male,no_disability,secondary_or_below,946080000,1585699200
5,1,irbid,male,no_disability,secondary_or_below,946080000,1593561600


In [29]:
df['age'] = df.job_search_start - df.birth_date
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start,age
1,0,maan,male,no_disability,secondary_or_below,946512000,1551052800,604540800
2,0,irbid,male,no_disability,secondary_or_below,946080000,1552780800,606700800
3,1,irbid,male,no_disability,secondary_or_below,946080000,1580515200,634435200
4,1,irbid,male,no_disability,secondary_or_below,946080000,1585699200,639619200
5,1,irbid,male,no_disability,secondary_or_below,946080000,1593561600,647481600


In [30]:
df['age'] = (np.round((df['age'] / (60 * 60 * 24 * 365)) / 10) * 10).astype(int)

In [31]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start,age
1,0,maan,male,no_disability,secondary_or_below,946512000,1551052800,20
2,0,irbid,male,no_disability,secondary_or_below,946080000,1552780800,20
3,1,irbid,male,no_disability,secondary_or_below,946080000,1580515200,20
4,1,irbid,male,no_disability,secondary_or_below,946080000,1585699200,20
5,1,irbid,male,no_disability,secondary_or_below,946080000,1593561600,20


In [32]:
print(df.info())
df = df[df.age <= 60]
df = df[df.age >= 10]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264432 entries, 1 to 308522
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   experience        264432 non-null  int64 
 1   governorate       264432 non-null  object
 2   gender            264432 non-null  object
 3   disability        264432 non-null  object
 4   education         264432 non-null  object
 5   birth_date        264432 non-null  int32 
 6   job_search_start  264432 non-null  int32 
 7   age               264432 non-null  int32 
dtypes: int32(3), int64(1), object(4)
memory usage: 15.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 264415 entries, 1 to 308522
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   experience        264415 non-null  int64 
 1   governorate       264415 non-null  object
 2   gender            264415 non-null  object
 3   disa

In [34]:
df.head()

Unnamed: 0,experience,governorate,gender,disability,education,birth_date,job_search_start,age
1,0,maan,male,no_disability,secondary_or_below,946512000,1551052800,20
2,0,irbid,male,no_disability,secondary_or_below,946080000,1552780800,20
3,1,irbid,male,no_disability,secondary_or_below,946080000,1580515200,20
4,1,irbid,male,no_disability,secondary_or_below,946080000,1585699200,20
5,1,irbid,male,no_disability,secondary_or_below,946080000,1593561600,20


In [35]:
df.drop(['birth_date', 'job_search_start'], axis=1, inplace=True)
df[['experience', 'gender', 'governorate', 'education', 'disability', 'age']].to_csv('stacked_results.csv', index=False, mode='a', header=False)

In [31]:
dfr = pd.read_csv('stacked_results.csv')
dfr.head()

Unnamed: 0,experience,gender,governorate,education,disability,age
0,0,male,al_kirk,secondary_or_below,no_disability,40
1,10,male,tafileh,secondary_or_below,no_disability,40
2,0,female,al_kirk,secondary_or_below,no_disability,60
3,0,male,al_kirk,secondary_or_below,no_disability,30
4,0,male,zarqa,secondary_or_below,no_disability,50


In [32]:
dfr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274407 entries, 0 to 274406
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   experience   274407 non-null  int64 
 1   gender       274407 non-null  object
 2   governorate  274407 non-null  object
 3   education    274407 non-null  object
 4   disability   274407 non-null  object
 5   age          274407 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 12.6+ MB


In [1]:
# dfr.experience = dfr.experience.astype(int)
# dfr.age = dfr.age.astype(int)
# dfr.info()