## IRAP & Mona data merge, EDA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression 

In [5]:
path = "Final Soft Launch Results/Current Source Data/"

In [3]:
d1="Data from IRAP 01072020.xlsx"
# d2="Merge for Vulnerability Screening.xlsx"
d3="Raw Data from Mona 01172020.xlsx"

In [None]:
irap = pd.read_excel (path+d1)
# merge = pd.read_excel (path+d2)
mona = pd.read_excel (path+d3)

In [None]:
# Number of whole days between the start of screening and the screening decision.
# `.dt.days` is used instead of `.astype('timedelta64[D]')`: pandas >= 2.0 only
# supports s/ms/us/ns timedelta resolutions, so the day-unit cast raises there.
# Rows where either date is missing end up as NaN.
screening_duration = irap['Date of Screening Decision'] - irap['Date of Screening Start']
irap['process_days'] = screening_duration.dt.days

In [None]:
mona = mona.rename({'Contact UUID': 'Mona Unique User ID'}, axis=1)

#### Joining tables into "full"

In [None]:
full = pd.merge(irap, mona, on='Mona Unique User ID',how="left").dropna(axis=1, how='all')

In [None]:
# Encode the two binary categoricals as 0/1 so they can be used numerically below.
# The result is assigned back instead of calling `replace(..., inplace=True)` on
# `full[col]`: that is a chained in-place update on an intermediate object, which
# pandas warns about and which has no effect under Copy-on-Write.
# `infer_objects()` makes the object -> numeric conversion explicit instead of
# relying on `replace`'s deprecated implicit downcasting.
full['Field:Gender'] = (
    full['Field:Gender'].replace({'Male': 1, 'Female': 0}).infer_objects()
)
full['Vulnerability Threshold'] = (
    full['Vulnerability Threshold'].replace({'Above': 1, 'Below': 0}).infer_objects()
)

#### Create one-hot features

In [None]:
def one_hot(dataframe, column_name):
    """Return a copy of `dataframe` with indicator columns for `column_name`.

    One indicator column is built per distinct value of the column using
    `pd.get_dummies`, and those columns are appended to the right of the
    existing ones. The source column itself is kept, and the input frame is
    not modified.
    """
    indicators = pd.get_dummies(dataframe[column_name])
    return pd.concat([dataframe, indicators], axis=1)

In [None]:
full['Field:Nationality1'] = 'nationality_' + full['Field:Nationality1']
full = one_hot(full,'Field:Nationality1')


In [None]:
full['Field:Current Country'] = 'curr_country_' + full['Field:Current Country']
full = one_hot(full,'Field:Current Country')

In [None]:
full.head()

In [None]:
numeric = full.select_dtypes(exclude=['object','datetime64[ns]'])

In [None]:
# Keep only the rows where every key analysis variable is present, then drop
# any column that still has a missing value among the remaining rows.
required_columns = [
    'Field:Age',
    'Field:Gender',
    'Field:Medical Condition Score',
    'Vulnerability Threshold',
    'Field:LPPN Score',
    'process_days',
]
has_all_required = numeric[required_columns].notna().all(axis=1)
multiple_vars = numeric[has_all_required].dropna(axis=1)

### Part A: Mona’s Vulnerability Scores versus Decision-Making Time:


In [None]:
# Scatter of the medical-needs score against screening duration.
# Points are semi-transparent so that overlapping observations show up darker.
fig, ax = plt.subplots(figsize=[10, 5])
ax.scatter(
    full['Field:Medical Needs Score'],
    full['process_days'],
    marker="o",
    alpha=0.2,
)
ax.set_ylim(bottom=-2)  # leave a little room below zero days
ax.set_title("Medical Needs")
ax.set_xlabel("Medical Needs Score")
ax.set_ylabel("No. of days to process")
plt.show()

In [None]:
# Scatter of the LPPN score against screening duration (green, semi-transparent).
fig, ax = plt.subplots(figsize=[10, 5])
ax.scatter(
    full['Field:LPPN Score'],
    full['process_days'],
    marker="o",
    alpha=0.2,
    c="g",
)
ax.set_ylim(bottom=-2)  # leave a little room below zero days
ax.set_title("LPPN (Legal and Physical Protection Needs)")
ax.set_xlabel("LPPN Score")
ax.set_ylabel("No. of days to process")
plt.show()

In [None]:
# LPPN score against a 0/1 indicator of whether screening took exactly one day.
# Rows with a missing duration compare unequal to 1 and so are plotted as 0.
finished_in_one_day = (full['process_days'] == 1).astype(int)

fig, ax = plt.subplots(figsize=[10, 2])
ax.scatter(full['Field:LPPN Score'], finished_in_one_day, marker="o", alpha=0.2, c="g")
ax.set_title("LPPN (Legal and Physical Protection Needs)")
ax.set_xlabel("LPPN Score")
ax.set_ylabel("Screening finished in one day?")
plt.show()

In [None]:
# For each LPPN score 0..6, count screenings that took exactly one day versus
# more than one day, and show the two counts side by side.
index = list(range(7))
lppn_score = full['Field:LPPN Score']
duration = full['process_days']

one_day = [int(((lppn_score == score) & (duration == 1)).sum()) for score in index]
n_days = [int(((lppn_score == score) & (duration > 1)).sum()) for score in index]

df = pd.DataFrame(
    {'Screening took 1 day': one_day, 'Screening took >1 days': n_days},
    index=index,
)
ax = df.plot.bar(rot=0)
ax.set_xlabel("LPPN Score")
ax.set_ylabel("Frequencies")

plt.show()

### Part B: Mona’s Vulnerability Scores versus IRAP’s Above/Below Threshold


In [None]:
full['Vulnerability Threshold'].value_counts()

In [None]:
# For each medical-needs score 0..6, count cases below (0) and above (1) the
# vulnerability threshold and plot the two counts side by side.
index = list(range(7))
medical_needs = full['Field:Medical Needs Score']
threshold = full['Vulnerability Threshold']

below = [int(((medical_needs == score) & (threshold == 0)).sum()) for score in index]
above = [int(((medical_needs == score) & (threshold == 1)).sum()) for score in index]

df = pd.DataFrame({'below': below, 'above': above}, index=index)
ax = df.plot.bar(rot=0)
ax.set_xlabel("Medical Needs Score")
ax.set_ylabel("Frequencies")

plt.show()

In [None]:
# For each LPPN score 0..6, count cases below (0) and above (1) the
# vulnerability threshold and plot the two counts side by side.
index = list(range(7))
lppn_score = full['Field:LPPN Score']
threshold = full['Vulnerability Threshold']

below = [int(((lppn_score == score) & (threshold == 0)).sum()) for score in index]
above = [int(((lppn_score == score) & (threshold == 1)).sum()) for score in index]

df = pd.DataFrame({'below': below, 'above': above}, index=index)
ax = df.plot.bar(rot=0)
ax.set_xlabel("LPPN Score")
ax.set_ylabel("Frequencies")

plt.show()

### Cross tab distribution

In [None]:
pd.crosstab(full['Field:LPPN Score'],full['Vulnerability Threshold'],margins = False)

In [None]:
157+29+14+26+23

In [None]:
full[full['Vulnerability Threshold']==1][['Vulnerability Threshold','Field:Medical Needs Score','Field:LPPN Score']]

In [None]:
pd.crosstab(full['Field:Medical Needs Score'],full['Vulnerability Threshold'],margins = False)

In [None]:
212+6+11+34+59

In [None]:
pd.crosstab(full['Field:LPPN Score'],full['process_days']>1,margins = False)

In [None]:
34+9+2+10+11

In [None]:
pd.crosstab(full['Field:Medical Needs Score'],full['process_days']>1,margins = False)

In [None]:
44+3+5+7+17

### Pearson correlations

In [None]:
medi_score = full[['Field:Medical Needs Score','process_days']].dropna()
pearsonr(medi_score['Field:Medical Needs Score'],medi_score['process_days'])

In [None]:
def find_pearson_time(var):
    """Return Pearson's r between column `var` of `full` and `process_days`.

    Rows where either value is missing are dropped before the correlation is
    computed. Only the coefficient is returned; the p-value is discarded.
    """
    pair = full[[var, 'process_days']].dropna()
    coefficient, _p_value = pearsonr(pair[var], pair['process_days'])
    return coefficient
    

In [None]:
def find_pearson_thresh(var):
    """Return Pearson's r between column `var` of `full` and the vulnerability threshold.

    Rows where either value is missing are dropped before the correlation is
    computed. Only the coefficient is returned; the p-value is discarded.
    """
    pair = full[[var, 'Vulnerability Threshold']].dropna()
    coefficient, _p_value = pearsonr(pair[var], pair['Vulnerability Threshold'])
    return coefficient
    

In [None]:
find_pearson_time('Field:Medical Needs Score')

In [None]:
# Encode the two-valued categorical columns as 0/1 for the correlation and
# regression cells below.
# Results are assigned back to `full[column]` instead of using
# `full[column].replace(..., inplace=True)` / `.fillna(..., inplace=True)`:
# those are chained in-place updates on an intermediate object, which pandas
# warns about and which have no effect under Copy-on-Write.
binary_codes = {
    ' Mona Follow up Needed': {'Yes': 1, 'No': 0},
    'Mona Case Type Identification': {'Accurate': 1, 'Inaccurate': 0},
    'Screening Stage': {'Screening Complete': 1, 'Screening': 0},
    'In IRAP Process': {'Yes': 1, 'No': 0},
    'First Referral': {'Yes': 1, 'No': 0},
    'Flags': {'Unresponsive': 1, 'responsive': 0},
}

# A missing flag is treated as "responsive" before encoding.
full['Flags'] = full['Flags'].fillna('responsive')

for column, codes in binary_codes.items():
    # `infer_objects()` makes the object -> numeric conversion explicit rather
    # than relying on `replace`'s deprecated implicit downcasting.
    full[column] = full[column].replace(codes).infer_objects()

In [None]:
# Correlate each encoded Mona/IRAP variable with the processing time and list
# them from strongest to weakest absolute correlation.
time_candidates = [
    ' Mona Follow up Needed',
    'Mona Case Type Identification',
    'Screening Stage',
    'In IRAP Process',
    'First Referral',
    'Flags',
    'Field:Medical Needs Score',
    'Field:LPPN Score',
]
pearson_time = [(find_pearson_time(name), name) for name in time_candidates]
sorted(pearson_time, key=lambda pair: abs(pair[0]), reverse=True)

In [None]:
# Correlate each encoded Mona/IRAP variable with the vulnerability threshold
# and list them from strongest to weakest absolute correlation.
thresh_candidates = [
    ' Mona Follow up Needed',
    'Mona Case Type Identification',
    'Screening Stage',
    'In IRAP Process',
    'First Referral',
    'Flags',
    'Field:Medical Needs Score',
    'Field:LPPN Score',
]
pearson_thresh = [(find_pearson_thresh(name), name) for name in thresh_candidates]
sorted(pearson_thresh, key=lambda pair: abs(pair[0]), reverse=True)

In [None]:
full['Screening Stage'].value_counts()

In [None]:
full[' Mona Follow up Needed'].value_counts()

### Regression

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
drop_na_full = full[[' Mona Follow up Needed', 'Mona Case Type Identification',
    'Screening Stage','In IRAP Process','First Referral','Flags',
          'Field:Medical Needs Score','Field:LPPN Score',"Vulnerability Threshold"]]

In [None]:
drop_na_full = drop_na_full.dropna()

In [None]:
x = drop_na_full[[' Mona Follow up Needed', 'Mona Case Type Identification',
    'Screening Stage','In IRAP Process','First Referral','Flags',
          'Field:Medical Needs Score','Field:LPPN Score']]

In [None]:
y = drop_na_full["Vulnerability Threshold"]

In [None]:
logi = LogisticRegression(solver='lbfgs')
logi.fit(x,y)

#### Logistic

In [None]:
sorted(pearson_time, key=lambda a: abs(a[0]), reverse=True)

In [None]:
sorted(list(zip(logi.coef_[0],x.columns)), key=lambda a: abs(a[0]), reverse=True)

#### Random Forest

In [None]:
# Fit a random forest on (x, y) and list features from most to least important.
rf = RandomForestClassifier()
rf.fit(x, y)
importance_by_feature = zip(rf.feature_importances_, x.columns)
sorted(importance_by_feature, reverse=True)

### Now on "processing time"

#### Pearson correlations

In [None]:
corr = full[['Field:Medical Needs Score','Field:LPPN Score']].dropna()
pearsonr(corr['Field:Medical Needs Score'],corr['Field:LPPN Score'])

In [None]:
plt.scatter(corr['Field:Medical Needs Score'],corr['Field:LPPN Score'],alpha=0.1)
plt.show()

In [None]:
corr2 = full[['Field:LPPN Score','process_days']].dropna()
pearsonr(corr2['Field:LPPN Score'],corr2['process_days'])

In [None]:
corr2 = full[['Field:Medical Needs Score','process_days']].dropna()
pearsonr(corr2['Field:Medical Needs Score'],corr2['process_days'])

In [None]:
corr2['Field:Medical Needs Score']

In [None]:
plt.scatter(corr2['Field:Medical Needs Score'],corr2['process_days'],alpha=0.1)
plt.ylim(-2,)
plt.show()

In [None]:
def find_pearson_time(var, df_name):
    """Return Pearson's r between column `var` of `df_name` and `process_days`.

    Rows where either value is missing are dropped before the correlation is
    computed. Only the coefficient is returned; the p-value is discarded.
    """
    pair = df_name[[var, 'process_days']].dropna()
    coefficient, _p_value = pearsonr(pair[var], pair['process_days'])
    return coefficient
    

In [None]:
def find_pearson_thresh(var, df_name):
    """Return Pearson's r between column `var` of `df_name` and the vulnerability threshold.

    Rows where either value is missing are dropped before the correlation is
    computed. Only the coefficient is returned; the p-value is discarded.
    """
    pair = df_name[[var, 'Vulnerability Threshold']].dropna()
    coefficient, _p_value = pearsonr(pair[var], pair['Vulnerability Threshold'])
    return coefficient
    

In [None]:
find_pearson_time('Field:Medical Needs Score',multiple_vars)

In [None]:
multiple_vars.columns

In [None]:
# Correlate every candidate predictor in `multiple_vars` with the processing
# time and list them from strongest to weakest absolute correlation.
time_predictors = [
    'Vulnerability Threshold', 'Field:Age', 'Field:Gender',
    'Field:LPPN Score', 'Field:Medical Condition Assessment',
    'Field:Medical Condition Score', 'Field:Medical Condition 1 Score',
    'Field:Medical Condition 2 Score', 'Field:Medical Condition 3 Score',
    'Field:Medical Needs Score', 'Field:Years in Current Country',
    'nationality_Iraqi', 'nationality_Syrian', 'curr_country_Egypt',
    'curr_country_Iraq', 'curr_country_Lebanon',
    'curr_country_Saudi Arabia', 'curr_country_Syria',
    'curr_country_Turkey',
]
pearson_time = [
    (find_pearson_time(name, multiple_vars), name) for name in time_predictors
]
sorted(pearson_time, key=lambda pair: abs(pair[0]), reverse=True)

In [None]:
# Correlate every candidate predictor in `multiple_vars` with the vulnerability
# threshold and list them from strongest to weakest absolute correlation.
thresh_predictors = [
    'process_days', 'Field:Age', 'Field:Gender',
    'Field:LPPN Score', 'Field:Medical Condition Assessment',
    'Field:Medical Condition Score', 'Field:Medical Condition 1 Score',
    'Field:Medical Condition 2 Score', 'Field:Medical Condition 3 Score',
    'Field:Medical Needs Score', 'Field:Years in Current Country',
    'nationality_Iraqi', 'nationality_Syrian', 'curr_country_Egypt',
    'curr_country_Iraq', 'curr_country_Lebanon',
    'curr_country_Saudi Arabia', 'curr_country_Syria',
    'curr_country_Turkey',
]
pearson_thresh = [
    (find_pearson_thresh(name, multiple_vars), name) for name in thresh_predictors
]
sorted(pearson_thresh, key=lambda pair: abs(pair[0]), reverse=True)

#### Regression

In [None]:
x = multiple_vars.drop('Vulnerability Threshold',axis=1)

In [None]:
y = multiple_vars['Vulnerability Threshold']

In [None]:
x2 = multiple_vars.drop('process_days',axis=1)

In [None]:
y2 = multiple_vars['process_days']

In [None]:
# Model 1: classify the vulnerability threshold (y) from the other columns of
# multiple_vars (x).
logi = LogisticRegression(solver='lbfgs',max_iter=10000)
logi.fit(x,y)
# Model 2: the same estimator with process_days (y2) as the target.
# NOTE(review): LogisticRegression is a classifier, so every distinct
# process_days value is treated as its own class rather than as a numeric
# duration — confirm this is intended (a regressor may be more appropriate).
logi2 = LogisticRegression(solver='lbfgs',max_iter=10000)
logi2.fit(x2,y2)

#### Logistic (vulnerability)

In [None]:
sorted(list(zip(logi.coef_[0],x.columns)), key=lambda a: abs(a[0]), reverse=True)

#### Logistic (process days)

In [None]:
# Rank the x2 features by absolute coefficient size for the process_days model.
# NOTE(review): `coef_[0]` is only the first row of the coefficient matrix. If
# process_days takes more than two distinct values the fitted model has one row
# per class, so this shows a single class's coefficients — confirm intent.
sorted(list(zip(logi2.coef_[0],x2.columns)), key=lambda a: abs(a[0]), reverse=True)

#### Random Forest  (vulnerability)

In [None]:
# Fit a random forest on the vulnerability target and list features from most
# to least important.
rf = RandomForestClassifier()
rf.fit(x, y)
importance_by_feature = zip(rf.feature_importances_, x.columns)
sorted(importance_by_feature, reverse=True)

#### Random Forest  (process days)

In [None]:
# Fit a random forest on the process_days target and list features from most
# to least important.
# The importances are paired with `x2.columns`, the frame the model was fit on.
# Pairing them with `x.columns` attaches the wrong labels: `x` and `x2` drop
# different columns, so their column order differs while their lengths match
# and `zip` does not complain.
rf = RandomForestClassifier()
rf.fit(x2, y2)
sorted(zip(rf.feature_importances_, x2.columns), reverse=True)