# Logisitc Regression Model using SK-Learn

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Utility for displaying our DataFrames
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

### First let's import the CV exercise to train on...

In [3]:
trainX = pd.read_csv('../match-data/trainX.csv')
trainy = pd.read_csv('../match-data/trainy.csv')

clean = pd.read_csv('../match-data/clean_all.csv')

display_side_by_side(trainy, trainX)

Unnamed: 0,Al - Rawi Company for the manufacture of wires and cables Jordan
0,0
1,0
2,1
3,1
4,1
5,1
6,0
7,0
8,1
9,0

Unnamed: 0,nationality,gender,age,education,night_shift
0,syrian,female,48,secondary,0
1,syrian,female,40,diploma,0
2,syrian,female,25,bachelors,0
3,syrian,male,35,secondary,0
4,syrian,male,38,diploma,1
5,syrian,female,21,bachelors,1
6,syrian,female,44,diploma,0
7,syrian,female,53,none,0
8,syrian,female,20,secondary,0
9,syrian,male,44,primary,1


In [4]:
lb = LabelEncoder()
X = trainX.apply(LabelEncoder().fit_transform)
y = trainy.T.values[0]

model2 = LogisticRegression(max_iter=1000)
model2.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### And then lets take our set of real candidates and predict their probabilities

In [11]:
clean.dropna(inplace=True)
testX = clean.drop(['Unnamed: 0', 'lat', 'lng'], axis=1)
tx = testX.apply(LabelEncoder().fit_transform)
predicted = model2.predict(tx)
probs = model2.predict_proba(tx)

### Comparison of our predictions vs the true generated probabilities

In [12]:
display_side_by_side(pd.DataFrame(probs.T[1], columns=['candidate fit']), 
                     pd.DataFrame(testX))

Unnamed: 0,candidate fit
0,0.996287
1,0.845234
2,0.998479
3,0.901316
4,0.99537
5,0.640816
6,0.999188
7,0.702628
8,0.666152
9,0.964

Unnamed: 0,nationality,gender,edu,age,night-shift
0,syrian,female,secondary,48.0,0.0
1,syrian,male,secondary,0.0,1.0
2,syrian,male,primary,44.0,1.0
3,syrian,female,secondary,28.0,0.0
4,syrian,female,diploma,40.0,0.0
5,syrian,female,primary,0.0,0.0
6,syrian,female,none,53.0,0.0
7,syrian,female,secondary,20.0,0.0
8,syrian,female,secondary,19.0,0.0
9,syrian,female,bachelors,25.0,0.0
