In [3]:
import pandas as pd
import numpy as np
import _pickle
from os.path import join
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
import xgboost as xgb

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [55]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier,\
GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, roc_auc_score,roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree

In [6]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek



# 1. Load dataframe and variables

In [None]:
inputfile_path = join('/Users','Toavina','githubdata','12.charting_and_modelling','1.pickles','merged_df_concat.pkl')

agg_df = _pickle.load(open(inputfile_path,'rb'))

%matplotlib inline
plt.style.use('ggplot')

pd.options.display.max_columns = 200

# 2. Create Moving Average and Signals to Detect Anomalies

In [None]:
# for each in all_dfs[0].columns:
#     print(each)

users = agg_df.columns.levels[0]

all_dfs = [agg_df[users[i]] for i in range(len(users))]

for df in tqdm_notebook(all_dfs):
    df['EWMA_6'] = df['AggEventsWeighted'].ewm(span=6).mean()
    df['EWMA_3'] = df['AggEventsWeighted'].ewm(span=3).mean()
    df['diff_6'] = df['AggEventsWeighted'] - df['EWMA_6']
    df['diff_3'] = df['AggEventsWeighted'] - df['EWMA_3']
    df['perc_diff_3'] = df['diff_3'] / df['EWMA_3']
    df['perc_diff_6'] = df['diff_6'] / df['EWMA_6']

for df in tqdm_notebook(all_dfs):
    
#     EWMA3_Threshold_80 = 0.8
    EWMA6_Threshold_80 = 0.8
      
#     df['EWMA_3_Signal'] = 0
    df['EWMA_6_Signal'] = 0
    
    for j in range(len(df)):
    
#         if df.ix[j,'perc_diff_3'] > EWMA3_Threshold_80:
#             df.ix[j,'EWMA_3_Signal'] = 1

        if df.ix[j,'perc_diff_6'] > EWMA6_Threshold_80:
            df.ix[j,'EWMA_6_Signal'] = 1

# Charting

In [None]:
def plot_events(usernum):
    
    df = all_dfs[usernum]
    
    min_date = df.index.tolist()[0]
    
    gh_username = df['InferredGHUserCopy'][0]
    
    hn_post_dates = all_dfs[usernum][all_dfs[usernum]['HNPosts'] == 1].index.tolist()
    new_job_dates = all_dfs[usernum][all_dfs[usernum]['NewJobFlag'] == 1].index.tolist()
    end_job_dates = all_dfs[usernum][all_dfs[usernum]['EndJobFlag'] == 1].index.tolist()
    new_edu_dates = all_dfs[usernum][all_dfs[usernum]['StartEduFlag'] == 1].index.tolist()
    end_edu_dates = all_dfs[usernum][all_dfs[usernum]['EndEduFlag'] == 1].index.tolist()
    
    gh_events_series = df['AggEventsWeighted']
    
#     EWMA_3 = df['EWMA_3']
    EWMA_6 = df['EWMA_6']
    
    GH_created_date = all_dfs[usernum]['GHAcctCreatedAt'][0] + pd.offsets.MonthEnd(0, normalize=True)
    
    try:
        plt.axvline(new_edu_dates[0], label = 'New Education')
        [plt.axvline(_x, color='b') for _x in new_edu_dates]
    except:
        pass
    
    try:
        plt.axvline(end_edu_dates[0], label = 'Ended Education')
        [plt.axvline(_x) for _x in end_edu_dates]
    except:
        pass
        
    
    try:
        plt.axvline(hn_post_dates[0], color='b', label = 'HN Job Post')
        [plt.axvline(_x, color='b') for _x in hn_post_dates]
    except:
        pass
    
    if GH_created_date > min_date:
        plt.axvline(GH_created_date, color ='y', label = 'Date GH account created')
    
    try:
        plt.axvline(end_job_dates[0], label = 'Ended Job')
        [plt.axvline(_x) for _x in end_job_dates]
    except:
        pass
    
    try:
        plt.axvline(new_job_dates[0], color='g', label = 'New Job')
        [plt.axvline(_x, color='g') for _x in new_job_dates]
    except:
        pass
    
    gh_events_series.plot(label = "Github Activity (# Events)")
#     EWMA_3.plot(label = "GH Activity EMA_3")
    EWMA_6.plot(label = "GH Activity EMA_6")
    
    plt.legend(bbox_to_anchor=(1.25,0.5), loc = 'lower center', fontsize='small', ncol=1)
    

In [None]:
plot_events(7)

# Model 1 - Predicting whether someone will switch jobs or not in next 3 / 6 months

In [None]:
# Create a column that returns 1 if job within the next 3-6 months, 0 if not
# One Hot Encode Relevant Columns, select the relevant ones, and run a RF on it to see baseline results
# Try both 3 and 6 months. When works, see if can create pipeline, do a grid search and run SVM, Linear Regression ...

# Model 2 - Same as above, but qualifying signals as good or bad

# Model 3 - TBD - Create a points system with my knowledge to qualify each signal, see whether machine can find it

# Model 4 - Unlikely - Using points system, score each item and see results versus the machine as to what is a good
# prospect

In [None]:
def look_fwd_3mo(df,postcolname,newjobcolname):
    
    df['lf_3_job_seeking'] = 0
    
    rows_remaining = len(df)
    
    dates = df.index
    
    for date in dates:
        rows_remaining -= 1
        if rows_remaining >= 3:
            items_to_check = [df.ix[date,postcolname],
                              df.ix[date+1,postcolname],
                              df.ix[date+2,postcolname],
                              df.ix[date+3,postcolname],
                              df.ix[date,newjobcolname],
                              df.ix[date+1,newjobcolname],
                              df.ix[date+2,newjobcolname],
                              df.ix[date+3,newjobcolname]
                             ]
        elif rows_remaining == 2:
            items_to_check = [df.ix[date,postcolname],
                              df.ix[date+1,postcolname],
                              df.ix[date+2,postcolname],
                              df.ix[date,newjobcolname],
                              df.ix[date+1,newjobcolname],
                              df.ix[date+2,newjobcolname]
                             ]
        elif rows_remaining == 1:
            items_to_check = [df.ix[date,postcolname],
                              df.ix[date+1,postcolname],
                              df.ix[date,newjobcolname],
                              df.ix[date+1,newjobcolname]
                             ]
        elif rows_remaining == 0:
            items_to_check = [df.ix[date,postcolname],
                              df.ix[date,newjobcolname]
                             ]
            
        stat_var = 0

        for item in items_to_check:
            if item == 1:
                stat_var = 1
        df.ix[date,'lf_3_job_seeking'] = stat_var
                                      
    return df

for df in tqdm_notebook(all_dfs):
    df = look_fwd_3mo(df,'HNPosts','NewJobFlag')

In [None]:
# _pickle.dump(all_dfs,open('all_dfs.pkl','wb'))

# Loading the model

In [7]:
all_dfs = _pickle.load(open('all_dfs.pkl','rb'))

In [8]:
all_obs = pd.concat(all_dfs, axis=0)

LinkedInOnly = all_obs[pd.notnull(all_obs['LinkedInName'])]

percentpos = LinkedInOnly['lf_3_job_seeking'].sum()/LinkedInOnly.shape[0]
print('The percentage of positive classifications is {0:.0f}'.format(percentpos))

The percentage of positive classifications is 0


# Separating X and Y, One Hot Encoding...

In [9]:
x = LinkedInOnly[['AggEventsEqual','AggEventsWeighted',
              'JobExpInstitutionType0','JobExpTitleType0',
              'JobExpInstitutionType1','JobExpTitleType1',
              'JobExpInstitutionType2','JobExpTitleType2',
              'JobExpInstitutionType3','JobExpTitleType3',
              'JobExpInstitutionType4','JobExpTitleType4',
              'JobExpInstitutionType5','JobExpTitleType5',
              'EduExpInstitutionType0','EduExpTitleType0',
              'EduExpInstitutionType1','EduExpTitleType1',
              'EduExpInstitutionType2','EduExpTitleType2',
              'EduExpInstitutionType3','EduExpTitleType3',
              'EduExpInstitutionType4','EduExpTitleType4',
#               'JobExpCurrentTenure0','JobExpCurrentTenure1','JobExpCurrentTenure2',
#               'JobExpCurrentTenure3','JobExpCurrentTenure4','JobExpCurrentTenure5',
              'NumCurrentJobs','NumCurrentEdu','NumCurrentJobsAndEdu','EmploymentStatus','NEET',
              'EndJobFlag','StartEduFlag','EndEduFlag','CumJobsToDate','CumEduToDate',
              'HighestDegree','HighestInstitutionType',
#             'HighestDegreeTimeSinceStartDate','HighestDegreeTimeSinceEndDate',
              'RecentJob',
              'RecentJobInstitutionType',
#               'RecentJobTimeSinceStartDate','RecentJobTimeSinceEndDate',
              'GHFollowers','GHFollowing',
#                   'GHHireable',      # Note - Removed because issues with setting to False if nothing
                  'PublicGists','PublicRepos','EWMA_6','EWMA_3','diff_6','diff_3',
              'EWMA_3_Signal','EWMA_6_Signal'
             ]]

In [10]:
y = LinkedInOnly['lf_3_job_seeking']
x = x.reset_index()
y = y.reset_index()

In [11]:
x[[col for col in x.columns if 'JobExpInstitutionType' in col]] = \
x[[col for col in x.columns if 'JobExpInstitutionType' in col]].fillna('none')

x[[col for col in x.columns if 'JobExpTitleType' in col]] = \
x[[col for col in x.columns if 'JobExpTitleType' in col]].fillna('none')

x[[col for col in x.columns if 'EduExpInstitutionType' in col]] = \
x[[col for col in x.columns if 'EduExpInstitutionType' in col]].fillna('none')

x[[col for col in x.columns if 'EduExpTitleType' in col]] = \
x[[col for col in x.columns if 'EduExpTitleType' in col]].fillna('none')

x[[col for col in x.columns if 'NumCurrent' in col]] = \
x[[col for col in x.columns if 'NumCurrent' in col]].fillna(0)

In [12]:
jobexpdummies = pd.concat([pd.get_dummies(x['JobExpInstitutionType{}'.format(str(i))], 
                prefix='JobExpInstitutionType{}'.format(str(i))) for i in range(6)], axis=1)

jobexptitledummies = pd.concat([pd.get_dummies(x['JobExpTitleType{}'.format(str(i))], 
                prefix='JobExpTitleType{}'.format(str(i))) for i in range(6)], axis=1)

eduexpdummies = pd.concat([pd.get_dummies(x['EduExpInstitutionType{}'.format(str(i))], 
                prefix='EduExpInstitutionType{}'.format(str(i))) for i in range(5)], axis=1)

eduexptitledummies = pd.concat([pd.get_dummies(x['EduExpTitleType{}'.format(str(i))], 
                prefix='EduExpTitleType{}'.format(str(i))) for i in range(5)], axis=1)


degreedummies = pd.get_dummies(x['HighestDegree'], prefix='HighestDegree')

highestintitutiontypedummies = pd.get_dummies(x['HighestInstitutionType'], prefix='HighestInstitutionType')

recentjobdummies = pd.get_dummies(x['RecentJob'], prefix = 'RecentJob')
recentjobinstitutiontype = pd.get_dummies(x['RecentJobInstitutionType'], prefix='RecentJobInstitutionType')

x = pd.concat([x,jobexpdummies,jobexptitledummies,eduexpdummies,eduexptitledummies,degreedummies,
          highestintitutiontypedummies,recentjobdummies,recentjobinstitutiontype], axis=1)

In [13]:
x = x.drop([
'index',
'JobExpInstitutionType0',
'JobExpTitleType0',
'JobExpInstitutionType1',
'JobExpTitleType1',
'JobExpInstitutionType2',
'JobExpTitleType2',
'JobExpInstitutionType3',
'JobExpTitleType3',
'JobExpInstitutionType4',
'JobExpTitleType4',
'JobExpInstitutionType5',
'JobExpTitleType5',
'EduExpInstitutionType0',
'EduExpTitleType0',
'EduExpInstitutionType1',
'EduExpTitleType1',
'EduExpInstitutionType2',
'EduExpTitleType2',
'EduExpInstitutionType3',
'EduExpTitleType3',
'EduExpInstitutionType4',
'EduExpTitleType4',
'HighestDegree',
'HighestInstitutionType',
'RecentJob',
'RecentJobInstitutionType'   
        ], axis=1)

# Creating test and train sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 666)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

y_test = y_test.drop('index', axis=1)
y_train = y_train.drop('index', axis=1)

y_train = np.array(y_train)
y_test = np.array(y_test)

y_train = y_train.reshape(y_train.shape[0],)
y_test = y_test.reshape(y_test.shape[0],)

# sm = SMOTEENN()

# X_train, y_train = sm.fit_sample(X_train, y_train)

In [15]:
1- (y_train.sum()/y_train.shape[0])

0.73843162670123141

In [16]:
1- (y_test.sum()/y_test.shape[0])

0.73582337819446542

## 1. Random Forest

In [17]:
rf=RandomForestClassifier(n_jobs = 7, max_features=13,n_estimators=1000)

rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)

y_true = y_test

print(accuracy_score(y_true, y_pred),roc_auc_score(y_true, y_pred))

0.776652048994 0.649738521865


## 2. XGBoost

In [18]:
XGBModel = xgb.XGBClassifier(n_estimators=1000, max_depth=10)

XGBModel.fit(X_train,y_train)

XGBpreds = XGBModel.predict(X_test)

print(accuracy_score(y_true,XGBpreds),roc_auc_score(y_true,XGBpreds))

0.771208226221 0.655028489918


# 3. LogisticRegression

In [21]:
LRModel = LogisticRegression(n_jobs=7)

LRModel.fit(X_train,y_train)

LRPreds = LRModel.predict(X_test)

print(accuracy_score(y_true,LRPreds),roc_auc_score(y_true,LRPreds))

0.747618327537 0.556078990206


# 4 . Support Vector Machines


In [None]:
clf = svm.SVC(verbose=True,kernel='linear')
clf.fit(X_train,y_train)

SVMPreds = clf.predict(X_test)

print(accuracy_score(y_true,SVMPreds),roc_auc_score(y_true,SVMPreds))

# 5. Extra Trees Classifier

In [42]:
Xtrees = ExtraTreesClassifier(n_jobs=7, n_estimators=1000, max_features=13,
                             min_samples_split=4)

Xtrees.fit(X_train,y_train)

XtreePreds = Xtrees.predict(X_test)

print(accuracy_score(y_true,XtreePreds),roc_auc_score(y_true,XtreePreds))

0.783305610162 0.672788252353


# 6. Bagging Classifier

In [None]:
Bags = BaggingClassifier(n_jobs=7, n_estimators=1000, max_features=13)

Bags.fit(X_train,y_train)

BagPreds = Bags.predict(X_test)

print(accuracy_score(y_true,BagPreds),roc_auc_score(y_true,BagPreds))

# 7. AdaBoostClassifier

In [53]:
Ada = AdaBoostClassifier(n_estimators=100)

Ada.fit(X_train, y_train)

Adapreds = Ada.predict(X_test)

Ada.score(X_test,y_true)

print(accuracy_score(y_true,Adapreds),roc_auc_score(y_true,Adapreds))

0.749130500529 0.562243159608


# 8. MultiLayer Perceptron Classifier

In [23]:
MLP = MLPClassifier(verbose=True)

MLP.fit(X_train, y_train)

MLPPreds = MLP.predict(X_test)

print(accuracy_score(y_true,MLPPreds),roc_auc_score(y_true,MLPPreds))

Iteration 1, loss = 0.57894724
Iteration 2, loss = 0.52620754
Iteration 3, loss = 0.51628624
Iteration 4, loss = 0.50928096
Iteration 5, loss = 0.50456962
Iteration 6, loss = 0.49939418
Iteration 7, loss = 0.49446902
Iteration 8, loss = 0.49169144
Iteration 9, loss = 0.48743298
Iteration 10, loss = 0.48428652
Iteration 11, loss = 0.48184342
Iteration 12, loss = 0.48121135
Iteration 13, loss = 0.47855487
Iteration 14, loss = 0.47465397
Iteration 15, loss = 0.47373064
Iteration 16, loss = 0.47230445
Iteration 17, loss = 0.46872058
Iteration 18, loss = 0.46704295
Iteration 19, loss = 0.46481878
Iteration 20, loss = 0.46579416
Iteration 21, loss = 0.46130931
Iteration 22, loss = 0.46107429
Iteration 23, loss = 0.45884096
Iteration 24, loss = 0.45792914
Iteration 25, loss = 0.45539907
Iteration 26, loss = 0.45534734
Iteration 27, loss = 0.45716991
Iteration 28, loss = 0.45190174
Iteration 29, loss = 0.45003415
Iteration 30, loss = 0.45147597
Iteration 31, loss = 0.44893726
Iteration 32, los

# 9. SGDClassifier

In [None]:
SGD = SGDClassifier(verbose=True, n_jobs=2)

SGD.fit(X_train_transformed, y_train)

SGDPreds = SGD.predict(X_test)

print(SGD.score(X_test,y_true))

print(accuracy_score(y_true,SGDPreds),roc_auc_score(y_true,SGDPreds))

# 10. Naive Bayes

In [None]:
NB = GaussianNB()

NB.fit(X_train, y_train)

NBPreds = NB.predict(X_test)

print(accuracy_score(y_true,NBPreds),roc_auc_score(y_true,NBPreds))

# 11. knn

In [59]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
knnpreds = knn.predict(X_test)
print(accuracy_score(y_true,knnpreds),roc_auc_score(y_true,knnpreds))

0.754423106003 0.63958689325


# 12. Decision Tree

In [52]:
dtree = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=5)
dtree.fit(X_train,y_train)
dtreepreds = dtree.predict(X_test)
print(accuracy_score(y_true,dtreepreds),roc_auc_score(y_true,dtreepreds))

0.75306215031 0.581975712695


# 13. Gradient Boosting Classifier

In [58]:
GBT = GradientBoostingClassifier(verbose=True, n_estimators=1000)
GBT.fit(X_train,y_train)
GBTpreds = GBT.predict(X_test)
print(accuracy_score(y_true,GBTpreds),roc_auc_score(y_true,GBTpreds))

      Iter       Train Loss   Remaining Time 
         1           1.1362            1.19m
         2           1.1259            1.17m
         3           1.1175            1.17m
         4           1.1106            1.18m
         5           1.1048            1.19m
         6           1.0998            1.18m
         7           1.0956            1.18m
         8           1.0915            1.16m
         9           1.0882            1.16m
        10           1.0852            1.16m
        20           1.0603            1.14m
        30           1.0450            1.16m
        40           1.0339            1.15m
        50           1.0257            1.10m
        60           1.0188            1.05m
        70           1.0129            1.03m
        80           1.0079            1.01m
        90           1.0026           59.72s
       100           0.9990           57.58s
       200           0.9622           49.56s
       300           0.9339           43.29s
       40