In [1]:
#PRE-PROCESSING 
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk",font_scale=2)

In [2]:
CPS_data_frame = pd.read_stata("CPS_data_even_years/CPS_data.dta",
                               convert_categoricals=False, 
                               convert_missing=False)

In [3]:
CPS_data_frame = CPS_data_frame[10000<CPS_data_frame['inctot']] #Looking at income > 10K

In [4]:
CPS_data_frame = CPS_data_frame[CPS_data_frame['inctot']<500000] # & also income < 500K

In [5]:
CPS_data_frame = CPS_data_frame[CPS_data_frame['age'] >= 30] #age >=30 

In [6]:
CPS_data_frame['college'] = (CPS_data_frame['educ99'] >= 15).astype(int) #introducing college attribute (15 == bachelors degree)

In [7]:
np.sum(CPS_data_frame['college']) #adds up how many people have BA/BS degree

93728

In [8]:
from sklearn import linear_model

In [9]:
reg_model = linear_model.LinearRegression()

In [10]:
temp_df = CPS_data_frame[['inctot', 'age', 'sex', 'labforce', 'college']] # TEMP features selected for now
temp_df = temp_df[temp_df['age'] <= 34]

In [11]:
temp_df = temp_df.dropna() #drops rows with missing values

In [12]:
temp_df.head(8)

Unnamed: 0,inctot,age,sex,labforce,college
7,24664.0,30,2,2,0
8,41000.0,33,1,2,0
17,10929.0,34,2,2,0
42,25000.0,33,2,2,0
52,20218.0,31,2,2,0
68,53300.0,34,1,2,1
69,52000.0,33,2,2,1
83,15500.0,31,2,2,0


In [13]:
np.sum(temp_df['labforce'] == 2) #adds up how many people are in the lab force

29747

In [14]:
temp_df['worker'] = (temp_df['labforce'] == 2).astype(int) #converts boolean to integer 

In [15]:
temp_df['female'] = (temp_df['sex'] == 2).astype(int) #converts boolean to integ for female (2== female)

In [16]:
temp_df['loginc'] = np.log(temp_df['inctot'] + 0.01) #normalization of data using natural log

In [17]:
reg_model.fit(temp_df['college'].to_frame(), temp_df['loginc'].to_frame()) #model fitting
# to balance the distribution - we use log of income
# interpretation becomes easier
# the model is more like the real world. example: % raise instead of # raise

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
print(reg_model.coef_)
print(reg_model.score(temp_df['college'].to_frame(), temp_df['loginc'].to_frame())) 
#1st number: 51% - that's how much more a 'college' person would make 
#2nd number: this model can explain more than 15% of the variation of income of the whole data


[[ 0.51051155]]
0.152728142662


In [19]:
reg_model.fit(np.asarray(temp_df[['college', 'age']]), np.asarray(temp_df['loginc'])) 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
print(reg_model.coef_)
print(reg_model.score(np.asarray(temp_df[['college', 'age']]), np.asarray(temp_df['loginc'])))
# example: 2.75% raise

[ 0.50900663  0.02495914]
0.155782226037


In [21]:
reg_model.fit(np.asarray(temp_df[['college', 'female']]),
              np.asarray(temp_df['loginc']))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
print(reg_model.coef_)
print(reg_model.score(np.asarray(temp_df[['college', 'female']]),
              np.asarray(temp_df['loginc'])))
#Being female meaning you make less 

[ 0.54218662 -0.28206222]
0.201171592374


In [23]:
reg_model.fit(np.asarray(temp_df[['college', 'age','female']]),
              np.asarray(temp_df['loginc']))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
print(reg_model.coef_)
print(reg_model.score(np.asarray(temp_df[['college', 'age','female']]),
              np.asarray(temp_df['loginc'])))

[ 0.5406821   0.0249796  -0.28207681]
0.204230685441


In [25]:
reg_model.fit(np.asarray(temp_df[['college', 'age', 'female', 'worker']]),
              np.asarray(temp_df['loginc']))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
print(reg_model.coef_)
print(reg_model.score(np.asarray(temp_df[['college', 'age', 'female', 'worker']]),
              np.asarray(temp_df['loginc'])))

[ 0.52823684  0.02425252 -0.27419702  0.3618321 ]
0.222343869635


In [27]:
len(temp_df) #number of rows - the amount of poeple we are including in our sample

31640

In [28]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


regr_1 = tree.DecisionTreeRegressor(max_depth=2)
regr_2 = tree.DecisionTreeRegressor(max_depth=5)

X = (temp_df[['college', 'age', 'female', 'worker']]).values
y = (temp_df['loginc']).values

kfold = KFold(n_splits=4, shuffle=True, random_state=0)
print("Cross-validation scores for Regression Tree of max_depth 2:\n{}".format(
    cross_val_score(regr_1, X, y, cv=kfold)))

kfold = KFold(n_splits=4, shuffle=True, random_state=0)
print("Cross-validation scores for Regression Tree of max_depth 5:\n{}".format(
    cross_val_score(regr_2, X, y, cv=kfold)))

Cross-validation scores for Regression Tree of max_depth 2:
[ 0.21006854  0.1907208   0.19349355  0.20868698]
Cross-validation scores for Regression Tree of max_depth 5:
[ 0.23254872  0.21096237  0.21634741  0.22966457]


In [29]:
temp_df2 = CPS_data_frame[['inctot', 'age', 'sex', 'labforce', 'college']]
temp_df2 = temp_df2[temp_df2['age'] <= 39]
temp_df2 = temp_df2[temp_df2['age'] >= 35]
temp_df2 = temp_df2.dropna() #drops rows with missing values
temp_df2['worker'] = (temp_df2['labforce'] == 2).astype(int) #converts boolean to integer 
temp_df2['female'] = (temp_df2['sex'] == 2).astype(int) #converts boolean to integ for female (2== female)
temp_df2['loginc'] = np.log(temp_df2['inctot'] + 0.01) #normalization of data using natural log

In [30]:
X_test = (temp_df2[['college', 'age', 'female', 'worker']]).values
y_test = (temp_df2['loginc']).values

In [31]:
reg_model.score(X_test,y_test)

0.24559544864244331

In [169]:
len(X_test)

32799

In [170]:
regr_1.fit(X,y)
print(regr_1.score(X,y))
print(regr_1.score(X_test,y_test))

0.201201964563
0.205536237959


In [171]:
regr_2.fit(X,y)
print(regr_2.score(X,y))
print(regr_2.score(X_test,y_test))

0.224569497201
0.243300076131


In [172]:
from sklearn.ensemble import AdaBoostRegressor

regr_1_AdaBoost = AdaBoostRegressor(n_estimators = 100)

print("Cross-validation scores for AdaBoost with 100 estimators:\n{}".format(
    cross_val_score(regr_2, X, y, cv=kfold)))

Cross-validation scores for AdaBoost with 100 estimators:
[ 0.23254872  0.21096237  0.21634741  0.22966457]


In [173]:
regr_1_AdaBoost.fit(X,y)
print(regr_1_AdaBoost.score(X,y))
print(regr_1_AdaBoost.score(X_test,y_test))

0.213687266878
0.23689166845


In [174]:
len(temp_df2)

32799

In [179]:
X.shape

(31640, 4)

In [180]:
X_test.shape

(32799, 4)

In [183]:
31640 - (temp_df['worker']).sum()

1893

In [32]:
print("Cross-validation scores for the originial linear regression:\n{}".format(
    cross_val_score(reg_model, X, y, cv=kfold)))

Cross-validation scores for the originial linear regression:
[ 0.23125705  0.21121298  0.21616658  0.22898754]
