In [1]:
# Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.cross_validation import cross_val_score
# Allows plots to appear directly in the notebook.
%matplotlib inline
%config IPCompleter.greedy=True



In [2]:
# Read the CSV file into a dataframe.
# `sep` and `delimiter` are aliases, so only one may be given: recent pandas
# raises a ValueError when both are passed. The original call passed both
# (sep=',\s+' and delimiter=','); a plain comma is kept here, and
# skipinitialspace=True strips any blanks that follow each comma.
df = pd.read_csv(
    'processed_data/happiness_data_alan.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
# Number of rows loaded from the CSV.
df.shape[0]

1344

In [4]:
# How many rows carry each distinct happiness score.
score_counts = df['happiness_score'].value_counts()
score_counts

5.0    435
6.0    342
4.0    260
7.0    233
3.0     40
8.0     34
Name: happiness_score, dtype: int64

In [5]:
# Binarise the score: 1.0 when happiness_score is strictly above 5, else 0.0.
happiness_class = df['happiness_score'].gt(5).astype(float)

# Wrap the binary series in a single-column dataframe named 'happiness_class'.
df_happiness_class = pd.DataFrame(dict(happiness_class=happiness_class))
df_happiness_class

Unnamed: 0,happiness_class
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


The categorical target feature is interpreted as a numeric feature with only two values: 0 (not happy) or 1 (happy).

In [6]:
# Create a new dataframe by appending the happiness_class column to the original dataframe.
frames_to_join = [df, df_happiness_class]
df_cross = pd.concat(frames_to_join, axis=1)
df_cross

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_60,infant_mortality,happiness_class
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.929690,-1.655084,10.297,15.600000,70.800000,0.0
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,15.700000,68.200000,0.0
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.137630,0.706766,0.618265,0.275324,0.299357,-1.991810,-1.617176,15.325,15.700000,65.700000,0.0
3,Afghanistan,2011,4.0,0.521104,50.367298,0.495901,0.175329,0.731109,0.611387,0.267175,0.307386,-1.919018,-1.616221,17.890,15.800000,63.300000,0.0
4,Afghanistan,2012,4.0,0.520637,50.709263,0.530935,0.247159,0.775620,0.710385,0.267919,0.435440,-1.842996,-1.404078,20.293,15.800000,61.000000,0.0
5,Afghanistan,2013,4.0,0.483552,51.042980,0.577955,0.074735,0.823204,0.620585,0.273328,0.482847,-1.879709,-1.403036,20.170,15.900000,58.800000,0.0
6,Afghanistan,2014,3.0,0.525568,51.370525,0.508514,0.118579,0.871242,0.531691,0.374861,0.409048,-1.773257,-1.312503,20.616,15.900000,56.800000,0.0
7,Afghanistan,2015,4.0,0.528597,51.693527,0.388928,0.094686,0.880638,0.553553,0.339276,0.260557,-1.844364,-1.291594,20.079,16.000000,54.900000,0.0
8,Afghanistan,2016,4.0,0.559072,52.016529,0.522566,0.057072,0.793246,0.564953,0.348332,0.324990,-1.917693,-1.432548,19.454,15.800000,53.200000,0.0
9,Afghanistan,2017,3.0,0.490880,52.339527,0.427011,-0.106340,0.954393,0.496349,0.371326,0.261179,-1.904737,-1.485251,20.889,15.800000,61.411111,0.0


In [7]:
# Drop the raw happiness_score column from df_cross.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: the original in-place drop failed on a second
# execution because the column was already gone.
df_cross = df_cross.drop('happiness_score', axis=1, errors='ignore')
df_cross.head()

Unnamed: 0,country,year,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_60,infant_mortality,happiness_class
0,Afghanistan,2008,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,15.6,70.8,0.0
1,Afghanistan,2009,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,15.7,68.2,0.0
2,Afghanistan,2010,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,15.7,65.7,0.0
3,Afghanistan,2011,0.521104,50.367298,0.495901,0.175329,0.731109,0.611387,0.267175,0.307386,-1.919018,-1.616221,17.89,15.8,63.3,0.0
4,Afghanistan,2012,0.520637,50.709263,0.530935,0.247159,0.77562,0.710385,0.267919,0.43544,-1.842996,-1.404078,20.293,15.8,61.0,0.0


In [8]:
# Show the dtype of every column in df_cross (rendered as the cell output).
df_cross.dtypes

country                    object
year                        int64
social_support            float64
healthy_life_exp_birth    float64
life_choices              float64
generosity                float64
corruption                float64
pos_affect                float64
neg_affect                float64
confidence_gov            float64
dem_quality               float64
delivery_quality          float64
gdp                       float64
life_exp_60               float64
infant_mortality          float64
happiness_class           float64
dtype: object

In [9]:
# Keep only the target column followed by the seven selected descriptive features.
selected_columns = [
    'happiness_class',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
df_cross = df_cross[selected_columns]

In [10]:
# Re-check the column dtypes after narrowing df_cross to the selected columns.
df_cross.dtypes

happiness_class           float64
social_support            float64
healthy_life_exp_birth    float64
pos_affect                float64
dem_quality               float64
delivery_quality          float64
life_exp_60               float64
infant_mortality          float64
dtype: object

In [11]:
# Persist the reduced dataframe as CSV, without writing the index.
output_csv = "processed_data/happiness_class_data.csv"
df_cross.to_csv(output_csv, index=False)

# Logistic regression

In [12]:
# Build an explicit 'Intercept' column of ones, one per row of df_cross.
# NOTE: scikit-learn's LogisticRegression already fits an intercept by default
# (fit_intercept=True), so this column is redundant for the model. It is kept
# because later cells select the 'Intercept' column by name.
# The length and index come from df_cross rather than a hard-coded 1344, so the
# rows stay aligned if the input data changes.
intercept = pd.DataFrame({'Intercept': np.ones(len(df_cross))}, index=df_cross.index)
intercept.head()

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [13]:
# Assemble the design matrix (intercept + descriptive features) and the target vector.
descriptive_columns = [
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
X = pd.concat([intercept, df_cross[descriptive_columns]], axis=1)
y = df_cross['happiness_class']
print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

Descriptive features:
       Intercept  social_support  healthy_life_exp_birth  pos_affect  \
0           1.0        0.450662               49.209663    0.517637   
1           1.0        0.552308               49.624432    0.583926   
2           1.0        0.539075               50.008961    0.618265   
3           1.0        0.521104               50.367298    0.611387   
4           1.0        0.520637               50.709263    0.710385   
5           1.0        0.483552               51.042980    0.620585   
6           1.0        0.525568               51.370525    0.531691   
7           1.0        0.528597               51.693527    0.553553   
8           1.0        0.559072               52.016529    0.564953   
9           1.0        0.490880               52.339527    0.496349   
10          1.0        0.833047               67.103607    0.640024   
11          1.0        0.733152               67.413696    0.647908   
12          1.0        0.759434               67.73040

In [14]:
# Replace missing feature values with the mean of their column.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split, so held-out rows influence the imputed values.
X = X.fillna(X.mean())
# The target must stay binary. Mean-filling it (as the original code did) would
# produce a fractional label that belongs to neither class, so fail loudly if a
# label is ever missing instead of imputing it.
if y.isnull().any():
    raise ValueError("Target 'happiness_class' contains missing values; drop those rows instead of imputing them.")

## Training the model

In [15]:
# Split the data into train and test sets.
# 30% of the rows (picked at random with a fixed seed) become test data; the rest is training data.
# Note that this training set is very small, so the model will not be very reliable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
train_frame = pd.concat([X_train, y_train], axis=1)
print("Training data:\n", train_frame)
test_frame = pd.concat([X_test, y_test], axis=1)
print("\nTest data:\n", test_frame)

Training data:
       Intercept  social_support  healthy_life_exp_birth  pos_affect  \
1059        1.0        0.924243               67.716011    0.703485   
621         1.0        0.824806               56.999390    0.765043   
641         1.0        0.871705               65.056374    0.738239   
351         1.0        0.768675               60.434174    0.566759   
875         1.0        0.695814               49.778648    0.650200   
1185        1.0        0.906098               65.486786    0.854544   
898         1.0        0.950128               71.086586    0.849100   
486         1.0        0.834716               74.644836    0.762151   
481         1.0        0.790215               63.247486    0.820184   
512         1.0        0.510575               58.209156    0.628734   
97          1.0        0.907778               62.584282    0.565597   
440         1.0        0.793318               70.288078    0.648514   
935         1.0        0.938647               63.096588    0.

In [16]:
# Fit a logistic regression classifier (default settings) on the training set.
logreg_train = LogisticRegression()
logreg_train.fit(X_train, y_train)
# Show the weight learned for each feature.
print("Coeficients on training set: \n", logreg_train.coef_)

Coeficients on training set: 
 [[-3.56479272  3.14226128 -0.02320625  3.26569265  0.23238677  0.39963547
   0.22995058 -0.04590809]]


## Testing the model (using the model to make predictions)

In [17]:
# Predicted class probabilities for the training rows; show the first 100.
model_columns = [
    'Intercept',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
train_output = logreg_train.predict_proba(X_train[model_columns])

print(train_output[:100])

[[ 0.18644001  0.81355999]
 [ 0.84902606  0.15097394]
 [ 0.46616852  0.53383148]
 [ 0.86141808  0.13858192]
 [ 0.96337465  0.03662535]
 [ 0.26171227  0.73828773]
 [ 0.03068773  0.96931227]
 [ 0.37257892  0.62742108]
 [ 0.38203781  0.61796219]
 [ 0.94841751  0.05158249]
 [ 0.65225882  0.34774118]
 [ 0.20534948  0.79465052]
 [ 0.27799815  0.72200185]
 [ 0.86977006  0.13022994]
 [ 0.98754654  0.01245346]
 [ 0.15949168  0.84050832]
 [ 0.99329108  0.00670892]
 [ 0.60260185  0.39739815]
 [ 0.36416501  0.63583499]
 [ 0.03121583  0.96878417]
 [ 0.62445996  0.37554004]
 [ 0.97727816  0.02272184]
 [ 0.44611933  0.55388067]
 [ 0.95377387  0.04622613]
 [ 0.68497147  0.31502853]
 [ 0.97860982  0.02139018]
 [ 0.11572115  0.88427885]
 [ 0.33338941  0.66661059]
 [ 0.05102187  0.94897813]
 [ 0.81776338  0.18223662]
 [ 0.31242468  0.68757532]
 [ 0.28116186  0.71883814]
 [ 0.68234731  0.31765269]
 [ 0.03997866  0.96002134]
 [ 0.24888509  0.75111491]
 [ 0.41356058  0.58643942]
 [ 0.14668334  0.85331666]
 

In [18]:
# Predicted class labels for the training rows; show the first 100.
model_columns = [
    'Intercept',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
train_predictions = logreg_train.predict(X_train[model_columns])

print("Predictions: ", train_predictions[:100])

Predictions:  [ 1.  0.  1.  0.  0.  1.  1.  1.  1.  0.  0.  1.  1.  0.  0.  1.  0.  0.
  1.  1.  0.  0.  1.  0.  0.  0.  1.  1.  1.  0.  1.  1.  0.  1.  1.  1.
  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  1.  0.  1.
  1.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.
  0.  1.  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  1.  0.  1.  1.  1.
  0.  1.  0.  0.  1.  1.  0.  1.  0.  1.]


In [19]:
# Accuracy of the fitted model on the same rows it was trained on.
accuracy = logreg_train.score(X=X_train, y=y_train)
print("Accuracy on the training set:", accuracy)

Accuracy on the training set: 0.81914893617


## Classification Evaluation Metrics

In [20]:
# Further evaluation metrics on the training set, computed and printed one by one.
train_metric_reports = (
    ("Accuracy: ", metrics.accuracy_score),
    ("Confusion matrix: \n", metrics.confusion_matrix),
    ("Classification report:\n ", metrics.classification_report),
)
for label, metric_fn in train_metric_reports:
    print(label, metric_fn(y_train, train_predictions))

Accuracy:  0.81914893617
Confusion matrix: 
 [[420  82]
 [ 88 350]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.83      0.84      0.83       502
        1.0       0.81      0.80      0.80       438

avg / total       0.82      0.82      0.82       940



## Model evaluation

In [21]:
# Fit the classifier on the training set again (same data and settings as the
# earlier fit); the cells that follow evaluate it on the test sample.
logreg_train = LogisticRegression()
logreg_train.fit(X_train, y_train)
# Show the weight learned for each feature.
print(logreg_train.coef_)

[[-3.56479273  3.14226126 -0.02320625  3.26569269  0.23238677  0.39963547
   0.22995058 -0.04590809]]


In [22]:
# Display the held-out feature rows produced by train_test_split.
X_test

Unnamed: 0,Intercept,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
1340,1.0,0.765839,50.051235,0.725214,-0.985267,-1.484067,17.600000,44.400000
328,1.0,0.632973,51.876041,0.579303,-0.616717,-0.675337,17.400000,60.900000
1151,1.0,0.463913,60.954514,0.369440,-2.448228,-1.548680,19.669297,24.254883
668,1.0,0.717357,67.254700,0.526726,-1.148957,-0.537930,18.700000,9.800000
393,1.0,0.935351,71.310410,0.745672,0.925257,1.445396,24.500000,3.600000
142,1.0,0.787652,67.526260,0.531436,-0.045077,-0.258664,20.500000,5.300000
1156,1.0,0.816993,70.290001,0.846232,0.881086,1.020804,19.669297,24.254883
491,1.0,0.832078,76.268028,0.664093,0.555141,1.821866,19.669297,24.254883
1109,1.0,0.947864,74.094345,0.716266,0.618424,0.873172,25.100000,2.800000
54,1.0,0.923799,72.384338,0.775210,1.196954,1.811844,25.300000,3.400000


In [23]:
# Estimated class probabilities for each row of the test set.
test_probabilities = logreg_train.predict_proba(X_test)
print(test_probabilities)

[[  9.11293665e-01   8.87063349e-02]
 [  9.74925495e-01   2.50745051e-02]
 [  9.74870732e-01   2.51292677e-02]
 [  7.93780750e-01   2.06219250e-01]
 [  5.46303386e-02   9.45369661e-01]
 [  5.32273977e-01   4.67726023e-01]
 [  3.56139376e-01   6.43860624e-01]
 [  4.62468362e-01   5.37531638e-01]
 [  6.88567132e-02   9.31143287e-01]
 [  3.59520214e-02   9.64047979e-01]
 [  9.87119217e-01   1.28807832e-02]
 [  7.36010214e-01   2.63989786e-01]
 [  6.05231180e-01   3.94768820e-01]
 [  2.41431129e-02   9.75856887e-01]
 [  9.43446498e-01   5.65535020e-02]
 [  2.65725639e-01   7.34274361e-01]
 [  7.18297814e-01   2.81702186e-01]
 [  2.61760519e-01   7.38239481e-01]
 [  2.65403692e-01   7.34596308e-01]
 [  3.43800545e-02   9.65619946e-01]
 [  6.31235779e-01   3.68764221e-01]
 [  2.59762230e-01   7.40237770e-01]
 [  8.38211475e-01   1.61788525e-01]
 [  1.03855377e-01   8.96144623e-01]
 [  4.77004922e-02   9.52299508e-01]
 [  6.69969333e-01   3.30030667e-01]
 [  2.29462799e-01   7.70537201e-01]
 

In [24]:
# Predicted class label for each row of the test set.
y_predicted = logreg_train.predict(X=X_test)
print(y_predicted)

[ 0.  0.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  0.  1.  0.  1.  0.  1.
  1.  1.  0.  1.  0.  1.  1.  0.  1.  1.  0.  1.  1.  1.  1.  0.  0.  0.
  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  1.  0.  0.  1.
  1.  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.
  0.  1.  1.  0.  1.  0.  1.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  0.
  0.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  1.  0.  1.  0.  1.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.
  1.  1.  0.  0.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  0.  1.
  0.  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.
  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.
  1.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  1.  1.
  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  0.  0.  0

In [25]:
# Further evaluation metrics on the test set, computed and printed one by one.
test_metric_reports = (
    ("Accuracy: ", metrics.accuracy_score),
    ("Confusion matrix: \n", metrics.confusion_matrix),
    ("Classification report:\n ", metrics.classification_report),
)
for label, metric_fn in test_metric_reports:
    print(label, metric_fn(y_test, y_predicted))

Accuracy:  0.834158415842
Confusion matrix: 
 [[203  30]
 [ 37 134]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.85      0.87      0.86       233
        1.0       0.82      0.78      0.80       171

avg / total       0.83      0.83      0.83       404



In [26]:
# Accuracy of the test-set predictions on its own.
test_accuracy = metrics.accuracy_score(y_test, y_predicted)
print(test_accuracy)

0.834158415842


In [27]:
# F1 score of the test-set predictions.
test_f1 = metrics.f1_score(y_test, y_predicted)
print(test_f1)

0.8


## Model evaluation using cross-validation

In [28]:
# 3-fold cross-validated F1 score of a fresh logistic regression on all of X, y.
scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='f1',
    cv=3,
)
# Per-fold scores, then their mean.
print(scores)
print(scores.mean())

[ 0.84541063  0.79012346  0.75603217]
0.79718875213


In [29]:
# 3-fold cross-validated accuracy of a fresh logistic regression on all of X, y.
scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=3,
)
# Per-fold scores, then their mean.
print(scores)
print(scores.mean())

[ 0.85714286  0.81026786  0.796875  ]
0.821428571429


## Normalise features and retrain the model

In [30]:
# Work with the continuous descriptive features only (the target column is left out).
continuous_columns = [
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
df_cont = df_cross[continuous_columns]
df_cont.head()

Unnamed: 0,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.450662,49.209663,0.517637,-1.92969,-1.655084,15.6,70.8
1,0.552308,49.624432,0.583926,-2.044093,-1.635025,15.7,68.2
2,0.539075,50.008961,0.618265,-1.99181,-1.617176,15.7,65.7
3,0.521104,50.367298,0.611387,-1.919018,-1.616221,15.8,63.3
4,0.520637,50.709263,0.710385,-1.842996,-1.404078,15.8,61.0


In [31]:
# Range-normalise every descriptive feature to [-1, 1]:
# min-max scale each column to [0, 1], then stretch by 2 and shift by -1.
# NOTE(review): the column min/max are taken over the whole dataset, not just training rows.
column_min = df_cont.min()
column_span = df_cont.max() - column_min
df_norm = (df_cont - column_min) / column_span
df_norm = df_norm * 2 - 1

# Build df_cross_norm: the intercept column, the rescaled descriptive features,
# and the original target column happiness_class, unchanged.
norm_parts = [intercept, df_norm, df_cross.happiness_class]
df_cross_norm = pd.concat(norm_parts, axis=1)
print(df_cross_norm)

      Intercept  social_support  healthy_life_exp_birth  pos_affect  \
0           1.0       -0.539623               -0.469795   -0.466070   
1           1.0       -0.248022               -0.447486   -0.237931   
2           1.0       -0.285985               -0.426804   -0.119746   
3           1.0       -0.337542               -0.407530   -0.143418   
4           1.0       -0.338881               -0.389137    0.197292   
5           1.0       -0.445270               -0.371188   -0.111764   
6           1.0       -0.324733               -0.353571   -0.417700   
7           1.0       -0.316044               -0.336198   -0.342461   
8           1.0       -0.228619               -0.318825   -0.303228   
9           1.0       -0.424247               -0.301452   -0.539337   
10          1.0        0.557355                0.492650   -0.044863   
11          1.0        0.270780                0.509328   -0.017728   
12          1.0        0.346176                0.526363   -0.087416   
13    

In [32]:
# Design matrix (intercept + normalised features) and target for the normalised run.
normalised_columns = [
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
X_norm = pd.concat([intercept, df_cross_norm[normalised_columns]], axis=1)
y_norm = df_cross_norm['happiness_class']

In [33]:
# Replace missing normalised feature values with the mean of their column.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split, so held-out rows influence the imputed values.
X_norm = X_norm.fillna(X_norm.mean())
# The target must stay binary. Mean-filling it (as the original code did) would
# produce a fractional label that belongs to neither class, so fail loudly if a
# label is ever missing instead of imputing it.
if y_norm.isnull().any():
    raise ValueError("Target 'happiness_class' contains missing values; drop those rows instead of imputing them.")

In [34]:
# Hold out 30% of the normalised rows as test data, using a fixed random seed of 0.
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(
    X_norm,
    y_norm,
    test_size=0.3,
    random_state=0,
)

In [35]:
# Fit a logistic regression classifier (default settings) on the normalised training set.
logreg_train_norm = LogisticRegression()
logreg_train_norm.fit(X_train_norm, y_train_norm)
# Show the weight learned for each feature.
print("Coeficients on normalise training set: \n", logreg_train_norm.coef_)

Coeficients on normalise training set: 
 [[-1.64999022  3.4907595   2.12386202  1.61789122  0.04366605  0.27201552
   2.13456712  0.14808939]]


In [36]:
# Predicted class probabilities for the normalised training rows; show the first 100.
model_columns = [
    'Intercept',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
output_train_norm = logreg_train_norm.predict_proba(X_train_norm[model_columns])
print(output_train_norm[:100])

[[  1.71326828e-01   8.28673172e-01]
 [  8.07336594e-01   1.92663406e-01]
 [  4.82774596e-01   5.17225404e-01]
 [  9.33165755e-01   6.68342446e-02]
 [  9.87145727e-01   1.28542731e-02]
 [  1.26192820e-01   8.73807180e-01]
 [  1.65710133e-02   9.83428987e-01]
 [  1.50122501e-01   8.49877499e-01]
 [  3.46764271e-01   6.53235729e-01]
 [  9.92847903e-01   7.15209740e-03]
 [  7.44829860e-01   2.55170140e-01]
 [  2.77844450e-01   7.22155550e-01]
 [  8.64894679e-02   9.13510532e-01]
 [  6.57361492e-01   3.42638508e-01]
 [  9.92947016e-01   7.05298395e-03]
 [  1.02850852e-01   8.97149148e-01]
 [  9.99798256e-01   2.01744453e-04]
 [  1.87021018e-01   8.12978982e-01]
 [  5.02164584e-01   4.97835416e-01]
 [  1.84888886e-02   9.81511111e-01]
 [  8.12074130e-01   1.87925870e-01]
 [  9.77381349e-01   2.26186511e-02]
 [  3.34839931e-01   6.65160069e-01]
 [  9.10321755e-01   8.96782453e-02]
 [  8.45468646e-01   1.54531354e-01]
 [  9.91740250e-01   8.25975039e-03]
 [  5.96117656e-02   9.40388234e-01]
 

In [37]:
# Predicted class labels for the normalised training rows; show the first 100.
model_columns = [
    'Intercept',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
predictions_norm = logreg_train_norm.predict(X_train_norm[model_columns])

print("Predictions on normalise training set: ", predictions_norm[:100])

Predictions on normalise training set:  [ 1.  0.  1.  0.  0.  1.  1.  1.  1.  0.  0.  1.  1.  0.  0.  1.  0.  1.
  0.  1.  0.  0.  1.  0.  0.  0.  1.  1.  1.  0.  1.  1.  0.  1.  1.  0.
  1.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  1.  0.  1.
  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  1.  1.  1.  1.  0.
  0.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  1.
  0.  1.  0.  0.  1.  0.  0.  1.  0.  1.]


In [38]:
# Further evaluation metrics on the normalised training set, computed and printed one by one.
train_norm_metric_reports = (
    ("Accuracy: ", metrics.accuracy_score),
    ("Confusion matrix: \n", metrics.confusion_matrix),
    ("Classification report:\n ", metrics.classification_report),
)
for label, metric_fn in train_norm_metric_reports:
    print(label, metric_fn(y_train_norm, predictions_norm))

Accuracy:  0.85
Confusion matrix: 
 [[436  66]
 [ 75 363]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.85      0.87      0.86       502
        1.0       0.85      0.83      0.84       438

avg / total       0.85      0.85      0.85       940



In [39]:
# Model evaluation: estimated class probabilities for each row of the normalised test set.
test_probabilities_norm = logreg_train_norm.predict_proba(X_test_norm)
print(test_probabilities_norm)

[[  9.49129352e-01   5.08706481e-02]
 [  9.91515587e-01   8.48441313e-03]
 [  9.98011757e-01   1.98824262e-03]
 [  9.02638460e-01   9.73615400e-02]
 [  2.96548641e-02   9.70345136e-01]
 [  7.17316450e-01   2.82683550e-01]
 [  1.91675617e-01   8.08324383e-01]
 [  2.05225229e-01   7.94774771e-01]
 [  2.08773736e-02   9.79122626e-01]
 [  1.93065277e-02   9.80693472e-01]
 [  9.99212235e-01   7.87765483e-04]
 [  3.58774786e-01   6.41225214e-01]
 [  8.16746509e-01   1.83253491e-01]
 [  6.70882863e-03   9.93291171e-01]
 [  9.48775018e-01   5.12249816e-02]
 [  9.70050097e-02   9.02994990e-01]
 [  9.79537538e-01   2.04624623e-02]
 [  1.01456418e-01   8.98543582e-01]
 [  1.66269288e-01   8.33730712e-01]
 [  1.81849981e-02   9.81815002e-01]
 [  7.04517210e-01   2.95482790e-01]
 [  1.34255494e-01   8.65744506e-01]
 [  9.73041656e-01   2.69583443e-02]
 [  1.02994291e-01   8.97005709e-01]
 [  2.90404349e-02   9.70959565e-01]
 [  7.22092922e-01   2.77907078e-01]
 [  5.21542254e-02   9.47845775e-01]
 

In [40]:
# Predicted class label for each row of the normalised test set.
y_predicted_norm = logreg_train_norm.predict(X=X_test_norm)
print(y_predicted_norm)

[ 0.  0.  0.  0.  1.  0.  1.  1.  1.  1.  0.  1.  0.  1.  0.  1.  0.  1.
  1.  1.  0.  1.  0.  1.  1.  0.  1.  1.  0.  1.  1.  1.  1.  0.  0.  0.
  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  1.  0.  1.  1.
  1.  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.
  0.  0.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  0.
  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  1.  0.  1.  0.  1.  0.  1.  1.  0.  0.  0.  1.  0.  1.  0.
  1.  1.  0.  0.  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.  0.  1.
  0.  1.  0.  1.  1.  1.  0.  1.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.
  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.
  1.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  1.  1.  1.  1.  0.  1.  1.  0.  0.  0

In [41]:
# Further evaluation metrics on the normalised test set, computed and printed one by one.
test_norm_metric_reports = (
    ("Accuracy: ", metrics.accuracy_score),
    ("Confusion matrix: \n", metrics.confusion_matrix),
    ("Classification report:\n ", metrics.classification_report),
)
for label, metric_fn in test_norm_metric_reports:
    print(label, metric_fn(y_test_norm, y_predicted_norm))

Accuracy:  0.863861386139
Confusion matrix: 
 [[204  29]
 [ 26 145]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.89      0.88      0.88       233
        1.0       0.83      0.85      0.84       171

avg / total       0.86      0.86      0.86       404



In [42]:
# Model evaluation using cross-validation:
# 3-fold F1 score of a fresh logistic regression on the normalised data.
scores_norm = cross_val_score(
    LogisticRegression(),
    X_norm,
    y_norm,
    scoring='f1',
    cv=3,
)
# Per-fold scores, then their mean.
print(scores_norm)
print(scores_norm.mean())


[ 0.8377724   0.82294264  0.83838384]
0.833032959623


In [43]:
# 3-fold cross-validated accuracy of a fresh logistic regression on the normalised data.
scores_norm = cross_val_score(
    LogisticRegression(),
    X_norm,
    y_norm,
    scoring='accuracy',
    cv=3,
)
# Per-fold scores, then their mean.
print(scores_norm)
print(scores_norm.mean())

[ 0.85044643  0.84151786  0.85714286]
0.849702380952
