In [1]:
import zipfile
# Unpack each downloaded archive into its own working directory.
for archive_name, target_dir in (
    ("CPS_data_even_years.zip", "CPS_data_even_years"),
    ("output4909048052568705413.zip", "ESS_Cumulative"),
):
    with zipfile.ZipFile(archive_name, "r") as archive:
        archive.extractall(target_dir)

In [2]:
#PRE-PROCESSING 
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "talk" plotting context with font sizes scaled by 2.
sns.set_context(context="talk", font_scale=2)

In [3]:
# Seed NumPy's global random number generator so code that draws from it
# produces the same sequence on every fresh run.
np.random.seed(seed=47)

In [5]:
# Load the CPS extract. Both conversions are switched off, so columns keep
# their raw numeric codes and missing values arrive as NaN.
cps_stata_path = "CPS_data_even_years/CPS_data.dta"
CPS_data_frame = pd.read_stata(
    cps_stata_path,
    convert_categoricals=False,
    convert_missing=False,
)

In [6]:
# Keep only rows whose `inctot` lies strictly between 10,000 and 500,000.
inctot_values = CPS_data_frame['inctot']
above_floor = inctot_values > 10000
below_cap = inctot_values < 500000
CPS_data_frame = CPS_data_frame[above_floor & below_cap]

In [45]:
# Restrict the sample to rows with age >= 30.
# Take an explicit copy: the following cells add new columns to this frame,
# and assigning into a boolean-indexed slice raises pandas'
# SettingWithCopyWarning.
CPS_data_frame = CPS_data_frame[CPS_data_frame['age'] >= 30].copy()

In [46]:
# Two 1/0 indicator columns plus the natural log of `inctot`.
# NOTE(review): presumably labforce == 2 means "in the labor force" and
# sex == 2 means female in the CPS coding — confirm against the codebook.
CPS_data_frame['worker'] = CPS_data_frame['labforce'].eq(2).astype(int)
CPS_data_frame['female'] = CPS_data_frame['sex'].eq(2).astype(int)
CPS_data_frame['loginc'] = np.log(CPS_data_frame['inctot'])

In [47]:
# 1 when educ99 is 15 or higher, else 0.
# NOTE(review): presumably code 15 is a bachelor's degree — confirm.
CPS_data_frame['college'] = CPS_data_frame['educ99'].ge(15).astype(int)

In [48]:
# 1 when educ99 is exactly 16, else 0.
CPS_data_frame['masters'] = CPS_data_frame['educ99'].eq(16).astype(int)

In [49]:
# 1 when educ99 is 10 or higher, else 0 (so it is also 1 for every row
# flagged by the higher-education indicators).
CPS_data_frame['high_school'] = CPS_data_frame['educ99'].ge(10).astype(int)

In [50]:
# 1 when educ99 is exactly 17, else 0.
CPS_data_frame['professional_degree'] = CPS_data_frame['educ99'].eq(17).astype(int)

In [51]:
# 1 when educ99 is exactly 18, else 0.
CPS_data_frame['doctorate'] = CPS_data_frame['educ99'].eq(18).astype(int)

In [68]:
# 1 when race is exactly 100, else 0.
# NOTE(review): presumably race code 100 denotes "white" — confirm.
CPS_data_frame['white'] = CPS_data_frame['race'].eq(100).astype(int)

In [69]:
# 1 when hispan is exactly 0, else 0.
# NOTE(review): presumably hispan code 0 denotes "not Hispanic" — confirm.
CPS_data_frame['not_hispanic'] = CPS_data_frame['hispan'].eq(0).astype(int)

In [73]:
# Columns carried into modelling: the target ('loginc') first, then the
# demographic indicators, then the education indicators.
list_of_features = (
    ['loginc']
    + ['age', 'female', 'worker', 'white', 'not_hispanic']
    + ['college', 'masters', 'high_school', 'professional_degree', 'doctorate']
)

In [74]:
# Narrow the CPS frame to the modelling columns only.
temp_df = CPS_data_frame.loc[:, list_of_features]

In [75]:
# Training sample: rows with age <= 34.
temp_df = temp_df.loc[temp_df['age'].le(34)]

In [78]:
# Discard every row that has a missing value in any selected column.
temp_df = temp_df.dropna(axis=0, how='any')

In [79]:
# Training design matrix: every selected column except the target.
# `columns=` replaces the positional axis argument (`drop('loginc', 1)`),
# which pandas 2.0 and later reject.
X = temp_df.drop(columns='loginc').values
# Training target: the log-income column.
y = temp_df['loginc'].values

In [80]:
# Hold-out sample: the same modelling columns, ages 35 through 39 inclusive,
# keeping only rows without missing values.
age_35_to_39 = CPS_data_frame['age'].between(35, 39)
temp_df2 = CPS_data_frame.loc[age_35_to_39, list_of_features].dropna()

In [81]:
# Hold-out design matrix: every selected column except the target.
# `columns=` replaces the positional axis argument (`drop('loginc', 1)`),
# which pandas 2.0 and later reject.
X_test = temp_df2.drop(columns='loginc').values
# Hold-out target: the log-income column.
y_test = temp_df2['loginc'].values

In [82]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


# Two regression trees that differ only in their maximum depth.
regr_1 = tree.DecisionTreeRegressor(max_depth=4)
regr_2 = tree.DecisionTreeRegressor(max_depth=5)

# 4-fold shuffled cross-validation; the fixed seed keeps the fold assignment
# identical for every model evaluated with this splitter.
kfold = KFold(n_splits=4, shuffle=True, random_state=47)

# The depth in each label is read from the estimator itself, so the text
# cannot drift from the model (the depth-4 tree was labelled "max_depth 2").
for regressor in (regr_1, regr_2):
    print("Cross-validation scores for Regression Tree of max_depth {}:\n{}".format(
        regressor.max_depth, cross_val_score(regressor, X, y, cv=kfold)))

Cross-validation scores for Regression Tree of max_depth 2:
[ 0.24869181  0.24268683  0.24201019  0.26031553]
Cross-validation scores for Regression Tree of max_depth 5:
[ 0.25856638  0.25037678  0.25136076  0.27223539]


In [83]:
# Fit the first regression tree on the training sample, then print its score
# (R^2 for scikit-learn regressors) on the training and hold-out samples.
regr_1.fit(X, y)
for features, target in ((X, y), (X_test, y_test)):
    print(regr_1.score(features, target))

0.250604281042
0.261050900752


In [84]:
# Fit the second regression tree, then print its training and hold-out scores.
regr_2.fit(X, y)
tree_train_score = regr_2.score(X, y)
tree_test_score = regr_2.score(X_test, y_test)
print(tree_train_score)
print(tree_test_score)

0.262165680839
0.271610874327


In [85]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor with 100 boosting rounds.
# random_state is pinned (same seed as the KFold splitter and the MLP) so the
# scores do not depend on how much of the global NumPy random stream earlier
# cells have already consumed.
regr_1_AdaBoost = AdaBoostRegressor(n_estimators=100, random_state=47)

print("Cross-validation scores for AdaBoost with 100 estimators:\n{}".format(
    cross_val_score(regr_1_AdaBoost, X, y, cv=kfold)))

Cross-validation scores for AdaBoost with 100 estimators:
[ 0.21369035  0.21420788  0.23026047  0.23813101]


In [86]:
# Fit AdaBoost on the training sample, then print training and hold-out scores.
regr_1_AdaBoost.fit(X, y)
for features, target in ((X, y), (X_test, y_test)):
    print(regr_1_AdaBoost.score(features, target))

0.228490771463
0.245468204933


In [87]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees.
# random_state is pinned (same seed as the KFold splitter and the MLP) so the
# bootstrap samples, and therefore the scores, repeat from run to run
# regardless of cell execution order.
regr_1_RF = RandomForestRegressor(n_estimators=100, random_state=47)

print("Cross-validation scores for RandomForest with 100 estimators:\n{}".format(
    cross_val_score(regr_1_RF, X, y, cv=kfold)))

Cross-validation scores for RandomForest with 100 estimators:
[ 0.26564672  0.25646533  0.26126771  0.27803155]


In [88]:
# Fit the random forest, then print its training and hold-out scores.
regr_1_RF.fit(X, y)
forest_train_score = regr_1_RF.score(X, y)
forest_test_score = regr_1_RF.score(X_test, y_test)
print(forest_train_score)
print(forest_test_score)

0.285597880184
0.291952151432


In [89]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor: L-BFGS solver, a single hidden layer of
# 10 units, and a fixed seed for the weight initialisation.
regr_1_MLP = MLPRegressor(solver='lbfgs', random_state=47, hidden_layer_sizes=[10])

# The label names the settings actually used: they are not scikit-learn's
# defaults, and "Backpropagation" is spelled as in the classifier section.
print("Cross-validation scores for Backpropagation (lbfgs solver, 10 hidden units):\n{}".format(
    cross_val_score(regr_1_MLP, X, y, cv=kfold)))

Cross-validation scores for Backpropogation with default settings:
[ 0.26490109  0.25916269  0.25994496  0.27709341]


In [92]:
# Fit once, then print the score on the training and hold-out samples.
# The estimator has a fixed random_state, so a second identical fit before
# the hold-out score would only repeat the same training at extra cost.
regr_1_MLP.fit(X, y)
print(regr_1_MLP.score(X, y))
print(regr_1_MLP.score(X_test, y_test))

0.251309769913
0.207919493377


In [111]:
# Load the cumulative ESS file. Both conversions are switched off, so columns
# keep their raw numeric codes and missing values arrive as NaN.
ess_stata_path = "ESS_Cumulative/ESS1-6e01_1_F1.dta"
ESS_data_frame = pd.read_stata(
    ess_stata_path,
    convert_categoricals=False,
    convert_missing=False,
)

In [122]:
# Keep rows with 30 <= agea <= 53 (both bounds inclusive; rows with a missing
# agea fail the comparison and are dropped, as before).
# Take an explicit copy: the next cell adds indicator columns to this frame,
# and assigning into a boolean-indexed slice raises pandas'
# SettingWithCopyWarning.
ESS_data_frame = ESS_data_frame[ESS_data_frame['agea'].between(30, 53)].copy()

In [141]:
# Binary (1/0) indicator columns derived from the raw ESS codes.
# NOTE(review): presumably gndr == 2 is female, brncntr == 1 is "born in the
# country", and eisced* values 6 / 7 are the two tertiary education levels —
# confirm against the ESS codebook.
ESS_data_frame['female'] = ESS_data_frame['gndr'].eq(2).astype(int)
ESS_data_frame['born_in_country'] = ESS_data_frame['brncntr'].eq(1).astype(int)

# The same pair of education flags is built from each eisced* source column,
# giving the unprefixed, father_, mother_ and partner_ column families.
for _prefix, _source in (('', 'eisced'),
                         ('father_', 'eiscedf'),
                         ('mother_', 'eiscedm'),
                         ('partner_', 'eiscedp')):
    ESS_data_frame[_prefix + 'college'] = ESS_data_frame[_source].eq(6).astype(int)
    ESS_data_frame[_prefix + 'post_college'] = ESS_data_frame[_source].eq(7).astype(int)

In [142]:
# Columns carried into the ESS models: the target ('hinctnta') first, then
# the demographic columns, then the four families of education flags.
ESS_features = (
    ['hinctnta']
    + ['agea', 'female', 'born_in_country']
    + ['college', 'post_college']
    + ['father_college', 'father_post_college']
    + ['mother_college', 'mother_post_college']
    + ['partner_college', 'partner_post_college']
)

In [143]:
# Narrow to the modelling columns and keep only rows with no missing values.
ESS_temp_df = ESS_data_frame.loc[:, ESS_features].dropna()

In [144]:
# Display (rows, columns) of the cleaned ESS modelling frame as a sanity check.
ESS_temp_df.shape

(48736, 12)

In [149]:
from sklearn.model_selection import train_test_split

# Split the ESS sample into train and test parts with a fixed seed.
# Features are every column except 'hinctnta', which is the target.
# `columns=` replaces the positional axis argument (`drop('hinctnta', 1)`),
# which pandas 2.0 and later reject.
ESS_X_train, ESS_X_test, ESS_y_train, ESS_y_test = train_test_split(
    ESS_temp_df.drop(columns='hinctnta'), ESS_temp_df['hinctnta'], random_state=47)

In [189]:
# Two classification trees that differ only in their maximum depth.
class_tree_1 = tree.DecisionTreeClassifier(max_depth=4)
class_tree_2 = tree.DecisionTreeClassifier(max_depth=5)

# 4-fold shuffled cross-validation with a fixed seed, scored with micro-F1.
kfold = KFold(n_splits=4, shuffle=True, random_state=47)

# The depth in each label is read from the estimator itself, so the text
# cannot drift from the model (the depth-4 tree was labelled "max_depth 2").
for classifier in (class_tree_1, class_tree_2):
    print("Cross-validation scores for Decision Tree of max_depth {}:\n{}".format(
        classifier.max_depth,
        cross_val_score(classifier, ESS_X_train, ESS_y_train, cv=kfold, scoring='f1_micro')))

Cross-validation scores for Decision Tree of max_depth 2:
[ 0.1509083   0.14456117  0.1476253   0.1466404 ]
Cross-validation scores for Decision Tree of max_depth 5:
[ 0.14532721  0.14401401  0.14313854  0.1456555 ]


In [169]:
# Fit the first classification tree, then print its score (mean accuracy for
# scikit-learn classifiers) on the train and test parts.
class_tree_1.fit(ESS_X_train, ESS_y_train)
for features, labels in ((ESS_X_train, ESS_y_train), (ESS_X_test, ESS_y_test)):
    print(class_tree_1.score(features, labels))

0.150470562486
0.150114904793


In [170]:
# Fit the second classification tree, then print its train and test scores.
class_tree_2.fit(ESS_X_train, ESS_y_train)
tree_train_accuracy = class_tree_2.score(ESS_X_train, ESS_y_train)
tree_test_accuracy = class_tree_2.score(ESS_X_test, ESS_y_test)
print(tree_train_accuracy)
print(tree_test_accuracy)

0.153698840009
0.152659225213


In [175]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with 100 boosting rounds.
# random_state is pinned (same seed as the KFold splitter and the MLP) so the
# scores do not depend on how much of the global NumPy random stream earlier
# cells have already consumed.
class_AdaBoost_1 = AdaBoostClassifier(n_estimators=100, random_state=47)

print("Cross-validation scores for AdaBoost with 100 estimators:\n{}".format(
    cross_val_score(class_AdaBoost_1, ESS_X_train, ESS_y_train, cv=kfold)))

Cross-validation scores for AdaBoost with 100 estimators:
[ 0.1495951   0.14718757  0.14718757  0.14784417]


In [177]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with 100 trees.
# random_state is pinned (same seed as the KFold splitter and the MLP) so the
# bootstrap samples, and therefore the scores, repeat from run to run
# regardless of cell execution order.
class_RF_1 = RandomForestClassifier(n_estimators=100, random_state=47)

print("Cross-validation scores for RandomForest with 100 estimators:\n{}".format(
    cross_val_score(class_RF_1, ESS_X_train, ESS_y_train, cv=kfold)))

Cross-validation scores for RandomForest with 100 estimators:
[ 0.13504049  0.13066316  0.13318013  0.13318013]


In [180]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron classifier trained with the L-BFGS solver and a
# fixed seed, evaluated with the shared 4-fold splitter.
class_MLP_1 = MLPClassifier(random_state=47, solver='lbfgs')

mlp_cv_scores = cross_val_score(class_MLP_1, ESS_X_train, ESS_y_train, cv=kfold)
print("Cross-validation scores for Backpropagation:\n{}".format(mlp_cv_scores))

Cross-validation scores for Backpropagation:
[ 0.1479536   0.14871963  0.14948566  0.14784417]


In [181]:
from sklearn.naive_bayes import GaussianNB

In [184]:
# Gaussian naive Bayes baseline, evaluated with the shared 4-fold splitter.
nb_classifier_1 = GaussianNB()

nb_cv_scores = cross_val_score(nb_classifier_1, ESS_X_train, ESS_y_train, cv=kfold)
print("Cross-validation scores for Naive Bayes:\n{}".format(nb_cv_scores))

Cross-validation scores for Naive Bayes:
[ 0.12256511  0.1291311   0.12738017  0.12628584]


In [185]:
# No analysis happens in this cell. `cross_val_score?` is IPython-only syntax
# for an interactive help lookup and is not valid Python; to read the
# docstring while developing, run `help(cross_val_score)` instead.