## Cleaned data modelling

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression 
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier


In [4]:
data = pd.read_csv("cleaned_data.csv")
data = data.drop(columns=['Unnamed: 0','Program Eligibilities','Client Location (Country)','Client Nationality'])

In [5]:
print(data.shape)
data.head()

(7655, 123)


Unnamed: 0,Vulnerability Threshold,Field:Aggr 3 Threats Since Moving,Field:RFR Submission,Field:US Prior Admission,Field:Aggr 3 Moved to Avoid Aggr,Field:Family Worked for USG,ILI Assistance Provided,Field:RFR,Field:Aggr 2 Moved to Avoid Aggr,Flags,...,process_day_>=1,process_day_>=7,process_days,Medical_X_LPPN,Field:LPPN Score,Screening Decision,Date of Screening Start,Date of Referral,First Referral,In IRAP Process
0,Above,No,No,No,No,No,No,No,No,No,...,0.0,0.0,0.0,,,Cannot Represent,2019-08-12 00:00:00,2019-07-25 00:00:00,No,No
1,Below,No,No,No,No,No,No,No,No,No,...,,,,,,,,2019-08-03 00:00:00,No,Yes
2,Below,No,No,No,No,No,No,No,No,No,...,,,,,,,2019-11-08 00:00:00,2019-08-08 00:00:00,Yes,No
3,Below,No,No,No,No,No,No,No,Yes,No,...,,,,0.0,6.0,,,2019-08-02 00:00:00,No,No
4,Above,No,No,No,No,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,Cannot Represent,2019-12-20 00:00:00,2019-08-02 00:00:00,No,No


In [6]:
train_data = data.dropna(subset=['process_days'])

#### Examine null counts

In [7]:
null_count = pd.Series.to_frame(train_data.isna().sum())
# null_count.rename(columns={"0": "null_count"})
null_count.columns = ["null_counts"]
null_count["total_counts"] = train_data.shape[0]
null_count["null_percent"] = np.round(null_count["null_counts"] / train_data.shape[0] , 5)
null_count["dtype"] = pd.Series.to_frame(train_data.dtypes)
null_count["sample_value"] = train_data.mode()[:1].T
null_count.sort_values(['null_counts'],ascending = False)[:20]

Unnamed: 0,null_counts,total_counts,null_percent,dtype,sample_value
Field:Handoff Timestamp,523,786,0.66539,object,2020-01-14 13:06:09.200927+02:00
Medical_X_LPPN,428,786,0.54453,float64,0
Field:LPPN Score,428,786,0.54453,float64,0
Field:Current Country,185,786,0.23537,object,Lebanon
Field:Nationality1,176,786,0.22392,object,Syrian
Field:Gender,175,786,0.22265,object,Male
Created On,170,786,0.21628,object,2019-07-27 23:44:20
Language,170,786,0.21628,object,ara
process_day_>=1,8,786,0.01018,float64,0
Screening Decision,5,786,0.00636,object,Cannot Represent


In [8]:
train_data.shape

(786, 123)

In [9]:
train_data = train_data.drop(train_data.columns[train_data.isnull().mean()>0.5], axis=1).dropna()
train_data.shape

(586, 120)

In [10]:
train_data.replace(dict.fromkeys(['Yes','Eligible','Above'], 1),inplace=True)
train_data.replace(dict.fromkeys(['No','Not Eligible','Below','Not Applicable'], 0),inplace=True)

train_data.head()

Unnamed: 0,Vulnerability Threshold,Field:Aggr 3 Threats Since Moving,Field:RFR Submission,Field:US Prior Admission,Field:Aggr 3 Moved to Avoid Aggr,Field:Family Worked for USG,ILI Assistance Provided,Field:RFR,Field:Aggr 2 Moved to Avoid Aggr,Flags,...,Field:Worked for USG,Field:Aggr 1 Mst Svr Only Threats,process_day_>=1,process_day_>=7,process_days,Screening Decision,Date of Screening Start,Date of Referral,First Referral,In IRAP Process
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,Cannot Represent,2019-08-12 00:00:00,2019-07-25 00:00:00,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,Cannot Represent,2019-12-20 00:00:00,2019-08-02 00:00:00,0,0
5,1,0,0,0,0,0,1,0,0,0,...,0,0,1.0,1.0,139.0,Cannot Represent,2019-07-31 00:00:00,2019-07-26 00:00:00,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,Cannot Represent,2019-12-16 00:00:00,2019-08-01 00:00:00,0,0
10,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,Cannot Represent,2019-11-14 00:00:00,2019-08-05 00:00:00,0,0


In [11]:
for i in train_data.columns:
#     print(i)
    print(train_data[i].value_counts())

1    521
0     65
Name: Vulnerability Threshold, dtype: int64
0    578
1      8
Name: Field:Aggr 3 Threats Since Moving, dtype: int64
0    583
1      3
Name: Field:RFR Submission, dtype: int64
0    580
1      6
Name: Field:US Prior Admission, dtype: int64
0    575
1     11
Name: Field:Aggr 3 Moved to Avoid Aggr, dtype: int64
0    582
1      4
Name: Field:Family Worked for USG, dtype: int64
0    548
1     38
Name: ILI Assistance Provided , dtype: int64
0    580
1      6
Name: Field:RFR, dtype: int64
0    543
1     43
Name: Field:Aggr 2 Moved to Avoid Aggr, dtype: int64
0    529
1     57
Name: Flags, dtype: int64
0    537
1     49
Name: Field:Aggr 1 Threats Since Moving, dtype: int64
0    572
1     14
Name: Field:USRAP Access, dtype: int64
0    463
1    123
Name: Field:Aggr 1 Moved to Avoid Aggr, dtype: int64
0    407
1    179
Name: Field:Multiple Incidents, dtype: int64
0    357
1    229
Name: Field:Vulnerability Assessed, dtype: int64
0    579
1      7
Name: URN:Telegram, dtype: int64


In [12]:
for col in train_data.columns:
    if len(train_data[col].unique()) == 1:
        train_data.drop(col,inplace=True,axis=1)

In [13]:
x = pd.Series.to_frame(train_data.dtypes)

In [14]:
# Convert string to datetime
for i in train_data.columns:
    if (train_data[i].dtype) == "object":
        try:
            train_data[i] = train_data[i].astype('datetime64')
            print("converted: "+i)
        except:
            pass

converted: Date of Screening Decision
converted: Created On
converted: Date of Screening Start
converted: Date of Referral


In [15]:
# Convert datetime to ordinal
for i in train_data.columns:
    if (train_data[i].dtype) == "datetime64[ns]":
        try:
            train_data[i] = train_data[i].apply(lambda x: x.toordinal())
            print("converted: "+i)
        except:
            pass

converted: Date of Screening Decision
converted: Created On
converted: Date of Screening Start
converted: Date of Referral


In [16]:
def one_hot(dataframe,column_name):
    gen_onehot_features = pd.get_dummies(dataframe[column_name])
    gen_onehot_features.columns = column_name+"::"+gen_onehot_features.columns
    result = pd.concat([dataframe, gen_onehot_features],axis=1)
    return result

In [17]:
# One hot categorical
# for i in train_data.columns:
#     if (train_data[i].dtype) == "object":
#         print(i)
#         train_data = one_hot(train_data,i)

In [18]:
train_data.shape

(586, 111)

In [20]:
for i in train_data.columns:
    if (train_data[i].dtype) == "object":
        train_data = train_data.drop(columns=[i])

#### Exclude irap vars

In [23]:
path = "../Final Soft Launch Results/Current Source Data/"
d1="Data from IRAP 01072020.xlsx"

In [24]:
irap = pd.read_excel (path+d1)
irap.head()

Unnamed: 0,Mona Unique Session ID,Mona Unique User ID,Created on,Created by,Pipeline,Date of Referral,First Referral,In IRAP Process,Screening Stage,Screener,...,Legal Assistance Identified,ILI Assistance Provided,Mona Case Type Identification,Mona Follow up Needed,Vulnerability Threshold,Screening Decision,Date of Screening Decision,Flags,Client Location (Country),Client Nationality
0,ce81140c-7083-4463-90ed-ad36d1a41011.2019-07-2...,ce81140c-7083-4463-90ed-ad36d1a41011,2019-12-11 06:02:17,Brooke Sauro,Mona,2019-07-25,No,No,Screening Complete,Michaela Gallien,...,,,Accurate,No,Below,Cannot Represent,2019-08-12,,,
1,92952aa434524f549543d3fcc79054fe20190804T02434...,92952aa4-3452-4f54-9543-d3fcc79054fe,2019-12-11 06:02:17,Brooke Sauro,Mona,2019-08-03,No,Yes,Screening Complete,Manal ElKhoury,...,Vulnerability RST Referral,,,,,,NaT,,,
2,8a4a87a6eeeb43019d820644a8a7534620190802T07350...,8a4a87a6-eeeb-4301-9d82-0644a8a75346,2019-12-11 06:02:17,Brooke Sauro,Mona,2019-08-02,No,No,Screening,Tiba Fatli,...,Vulnerability RST Referral,,,,,,NaT,,,
3,758240c9578a4e29b66bd1d95de9cdc620190802T21293...,758240c9-578a-4e29-b66b-d1d95de9cdc6,2019-12-11 06:02:17,Brooke Sauro,Mona,2019-08-02,No,No,Screening Complete,Tania El Khoury,...,Vulnerability RST Referral,,Accurate,No,Below,Cannot Represent,2019-12-20,,,
4,55865e62-97b3-4656-b470-7eaaa6a517cf.2019-07-2...,55865e62-97b3-4656-b470-7eaaa6a517cf,2019-12-11 06:02:16,Brooke Sauro,Mona,2019-07-26,No,No,Screening Complete,Tania El Khoury,...,Vulnerability RST Referral,Non-Legal Services Information,Accurate,Yes,Below,Cannot Represent,2019-12-17,,,


In [28]:
for i in irap.columns:
#     print(i)
    if i in train_data.columns:
        print(i)
        train_data = train_data.drop(columns=[i])

Date of Referral
First Referral
In IRAP Process
Date of Screening Start
ILI Assistance Provided 
Vulnerability Threshold
Date of Screening Decision
Flags


### Ready for training

In [31]:
x = train_data.drop(columns=['process_day_>=1','process_day_>=7','process_days'])
# x.head()

In [39]:
y = train_data["process_day_>=1"]
# y.head()

In [40]:
clf = LogisticRegression().fit(x,y)

In [41]:
sorted(list(zip(clf.coef_[0],x.columns)), key=lambda a: abs(a[0]), reverse=True)
# sorted(pearson_time, key=lambda a: abs(a[0]), reverse=True)

[(-1.125073562578991e-06, 'Created On'),
 (-2.0835917689939266e-12, 'Field:Vulnerability Assessed'),
 (-2.0714806847309882e-12, 'Field:Lebanon UNHCR Info Rating'),
 (-1.8386972615782218e-12, 'Field:Lack of Durable Solutions'),
 (-1.6802945741512857e-12, 'Married'),
 (-1.5862311254629261e-12, 'URN:Facebook'),
 (-1.5560306345761639e-12, 'Field:Age'),
 (-1.4964178276351833e-12, 'Field:End Feedback Helpfulness Rating'),
 (-1.4961032462623253e-12, 'Adult'),
 (-1.4767715338006561e-12, 'Field:Women and Girls at Risk'),
 (-1.317894954286549e-12, 'Field:End Feedback Recommend Rating'),
 (1.296707078821173e-12, 'Field:Multiple Incidents'),
 (1.232211585714559e-12, 'Field:Medical Condition 1 SVT'),
 (1.2090980814137806e-12, 'Field:Aggr 1 SVT Rating'),
 (1.179055917568286e-12, 'Field:Aggressor 1 Score'),
 (-1.1720948261656898e-12, 'Field:SVT'),
 (-1.1720948261656898e-12, 'Field:LGBT'),
 (-1.1720948261656898e-12, 'Field:Years in Current Country'),
 (-1.1720948261656898e-12, 'Field:Medical Needs Sco

In [42]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, SelectKBest


In [43]:
def get_feat(method, k, x, y):
    select = SelectKBest(method, k)
    _ = select.fit_transform(x, y)
    return [x.columns[i] for i, val in enumerate(select.get_support()) if val] 

In [47]:
num_features = 10
chi2_select = get_feat(chi2, num_features, x, y)
f_select = get_feat(f_classif, num_features, x, y)
# mutual_select = get_feat(mutual_info_classif, num_features, x, y)


In [48]:
for i in [chi2_select,f_select]:
    process_day7=sm.Logit(y,x[i]).fit(method="bfgs",maxiter=1000)
    print(process_day7.summary())

Optimization terminated successfully.
         Current function value: 0.661352
         Iterations: 73
         Function evaluations: 74
         Gradient evaluations: 74
                           Logit Regression Results                           
Dep. Variable:        process_day_>=1   No. Observations:                  586
Model:                          Logit   Df Residuals:                      576
Method:                           MLE   Df Model:                            9
Date:                Thu, 21 May 2020   Pseudo R-squ.:                -0.07710
Time:                        07:44:23   Log-Likelihood:                -387.55
converged:                       True   LL-Null:                       -359.81
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

### Variables that affect process time

Field:Aggr 2 Threat Within Last 6 Months - We changed this to a null/not null due to high % of missing data, so person who undergoes threat is more likely to result in >1 day to process.

Field:Aggr 2 SVT Rating - We changed this to a null/not null due to high % of missing data, so person with a rating is less likely to result in >1 day to process.

Field:Aggr 2 Reason Unable to Move - same as above, if there is a reason unable to move, more likely to result in >1 day to process.

Field:Medical Condition 1 SVT - (Awaiting data re-cleanup)Yes --> morelikely to result in >1 day to process.

Field:Medical Condition 1 Score - If not null, less likely to result in >1 day to process.

In [50]:
import pprint

In [51]:
for i in [chi2_select,f_select]:
    model = RandomForestClassifier(max_depth=10,n_estimators = 64,criterion='gini')
    model.fit(x[i],y)
    pprint.pprint(sorted(list(zip(model.feature_importances_,x[i].columns)), reverse=True))

[(0.1887672203210171, 'Field:Aggr 2 Threat Within Last 6 Months'),
 (0.1482449421383513, 'Field:Medical Condition 1 SVT'),
 (0.14403429183925281, 'Field:Multiple Incidents'),
 (0.09946901077002876, 'Field:Aggr 1 Mst Svr Attack'),
 (0.08430916710342437, 'Field:Aggr 2 SVT Rating'),
 (0.0814152694933046, 'Field:Aggr 2 Mst Svr Attack'),
 (0.07477247133869974, 'Field:RFR'),
 (0.0734542837210444, 'Field:Aggressor 1 Score'),
 (0.05857090862161885, 'Field:Aggr 1 SVT Rating'),
 (0.04696243465325816, 'Field:Aggr 2 Reason Unable to Move')]
[(0.14521733944820175, 'Field:Aggr 2 Threat Within Last 6 Months'),
 (0.13565843340115755, 'Field:Multiple Incidents'),
 (0.1354995616138294, 'Field:Medical Condition 1 SVT'),
 (0.11391934408143892, 'Field:Medical Condition 1 Score'),
 (0.10423477792697387, 'Field:Aggr 1 Mst Svr Attack'),
 (0.09182309675084264, 'Field:Aggr 2 Mst Svr Attack'),
 (0.07899012748800693, 'Field:Aggr 2 SVT Rating'),
 (0.07659789923112129, 'Field:Aggressor 1 Score'),
 (0.06881627190264

### Irap new data format investigation

In [55]:
Podio = pd.read_excel('20200430 Podio ILI.xlsx')
Zendesk = pd.read_excel('20200511 Zendesk ILI.xlsx')

In [56]:
Podio.head()

Unnamed: 0,Mona Unique Session ID,Mona Unique User ID,Created on,Created by,Pipeline,Date of Referral,First Referral,In IRAP Process,Screening Stage,Screener,...,Legal Assistance Identified,ILI Assistance Provided,Mona Case Type Identification,Mona Follow up Needed,Vulnerability Threshold,Screening Decision,Date of Screening Decision,Flags,Client Location (Country),Client Nationality
0,7c7e1b0ccbf84198b2e4679d9f0a009b20200301T16562...,7c7e1b0c-cbf8-4198-b2e4-679d9f0a009b,2020-03-04 14:56:02,Brooke Sauro,Mona,2020-03-01,No,Yes,Screening Complete,,...,Vulnerability RST Referral,Pro Se Guidance,,,,,NaT,,,
1,cf2566dc260245dd802e0ddece04e93b20200229T01101...,cf2566dc-2602-45dd-802e-0ddece04e93b,2020-03-04 14:56:01,Brooke Sauro,Mona,2020-02-29,No,Yes,Screening Complete,,...,Vulnerability RST Referral,Pro Se Guidance,,,,,NaT,,,
2,39320c36ec174a27bb2e037659d2fd7520200229T13181...,39320c36-ec17-4a27-bb2e-037659d2fd75,2020-03-04 14:56:00,Brooke Sauro,Mona,2020-02-29,No,Yes,Screening Complete,,...,Vulnerability RST Referral,Pro Se Guidance,,,,,NaT,,,
3,96b29a4d34724a5a9cd4dffd5b3b732320200229T14210...,96b29a4d-3472-4a5a-9cd4-dffd5b3b7323,2020-03-04 14:56:00,Brooke Sauro,Mona,2020-02-29,No,Yes,Screening Complete,,...,Vulnerability RST Referral,Pro Se Guidance,,,,,NaT,,,
4,18e04d06b6d4452f9b70010e8572b73b20200228T20434...,18e04d06-b6d4-452f-9b70-010e8572b73b,2020-03-04 14:55:58,Brooke Sauro,Mona,2020-02-28,No,Yes,Screening Complete,,...,Vulnerability RST Referral,Pro Se Guidance,,,,,NaT,,,


In [57]:
Zendesk.head()

Unnamed: 0,Mona Unique Session ID,Pipeline,Date of Referral,First Referral,In IRAP Process,Screening Stage,Screener,Date of Screening Start,Program Eligibilities,ILI Assistance Provided,...,Screening Decision,Date of Screening Decision,Flags,Client Location (Country),Client Nationality,Replies,First reply time in minutes,First resolution time in minutes,Agent wait time in minutes,Requester wait time in minutes
0,851faec9d8aa4fe2a471eb7fc9ce247e20191014T02543...,Mona,2019-10-26 22:34,Yes,No,Screening Complete,Tiba Fatli,2020-01-27 15:23,Unknown,,...,Cannot Represent,2020-03-31 14:22,Unresponsive,Unknown,Syrian,3,133549.0,225587,92038,133549
1,ac26ef48d78446099e2324d6ed46112b20191012T01264...,Mona,2019-10-26 22:35,Yes,No,Screening Complete,Tiba Fatli,2020-01-28 10:07,Follow-to-Join SIV,,...,Cannot Represent,2020-03-24 13:03,Unresponsive,Lebanon,Syrian,3,134673.0,215428,80756,134673
2,b735446fbc7f49b2b61f1e63da1a757520191012T00365...,Mona,2019-10-26 22:35,No,No,Screening Complete,Manal El Khoury,2020-01-28 06:19,Vulnerability,,...,Cannot Represent,2020-03-27 03:43,Unresponsive,Lebanon,Syrian,3,134445.0,219188,84744,134445
3,7e737b5994fc4addb2b5a1cb70b718cf20191006T23304...,Mona,2019-10-26 22:37,No,No,Screening Complete,Manal El Khoury,2020-01-28 06:01,Vulnerability,,...,Cannot Represent,2020-03-31 10:48,,Palestine,Palestinian,5,134424.0,223546,89344,135993
4,70178665bc234831a6156cc7b809d39920191004T13314...,Mona,2019-10-26 22:37,No,No,Screening Complete,Manal El Khoury,2020-01-28 05:46,USRAP,,...,Cannot Represent,2020-03-30 04:26,Unresponsive,Lebanon,Syrian,5,134408.0,223549,89140,134408


In [63]:
for i in Podio.columns:
    if i in Zendesk.columns:
        print("[Both]:" + i)
    else:
        print("[Only Podio]:" + i)

for i in Zendesk.columns:
    if i not in Podio.columns:
        print("[Only Zendesk]:" + i)

[Both]:Mona Unique Session ID
[Only Podio]:Mona Unique User ID
[Only Podio]:Created on
[Only Podio]:Created by
[Both]:Pipeline
[Both]:Date of Referral
[Both]:First Referral
[Both]:In IRAP Process
[Both]:Screening Stage
[Both]:Screener
[Both]:Date of Screening Start
[Both]:Program Eligibilities
[Only Podio]:Legal Assistance Identified
[Only Podio]:ILI Assistance Provided 
[Both]:Mona Case Type Identification
[Only Podio]: Mona Follow up Needed
[Both]:Vulnerability Threshold
[Both]:Screening Decision
[Both]:Date of Screening Decision
[Both]:Flags
[Both]:Client Location (Country)
[Both]:Client Nationality
[Only Zendesk]:ILI Assistance Provided
[Only Zendesk]:Replies
[Only Zendesk]:First reply time in minutes
[Only Zendesk]:First resolution time in minutes
[Only Zendesk]:Agent wait time in minutes
[Only Zendesk]:Requester wait time in minutes
