In [1]:
#PRE-PROCESSING 
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "talk" plotting context with font sizes scaled by a factor of 2.
sns.set_context("talk",font_scale=2)

In [2]:
# Load the CPS extract from a Stata .dta file (path is relative to the working directory).
# convert_categoricals=False keeps the raw codes instead of pandas Categoricals;
#   later cells compare those codes numerically (educ99 >= 15, sex == 2, labforce == 2).
# convert_missing=False: per the pandas docs, missing values are read as NaN rather
#   than Stata missing-value objects.
CPS_data_frame = pd.read_stata("CPS_data_even_years/CPS_data.dta",
                               convert_categoricals=False, 
                               convert_missing=False)

In [3]:
# Keep only rows whose total income is strictly above 10,000.
CPS_data_frame = CPS_data_frame[CPS_data_frame['inctot'].gt(10000)]

In [4]:
# ...and strictly below 500,000, completing the income window.
CPS_data_frame = CPS_data_frame[CPS_data_frame['inctot'].lt(500000)]

In [5]:
# Restrict the sample to people aged 30 or older.
CPS_data_frame = CPS_data_frame[CPS_data_frame['age'].ge(30)]

In [6]:
# ...and aged 34 or younger, so the cohort is 30-34 inclusive.
CPS_data_frame = CPS_data_frame[CPS_data_frame['age'].le(34)]

In [7]:
# 0/1 college indicator: educ99 code 15 (bachelor's degree) or higher counts as college.
CPS_data_frame['college'] = CPS_data_frame['educ99'].ge(15).astype(int)

In [8]:
# Number of people in the sample holding a BA/BS degree (sum of the 0/1 indicator).
CPS_data_frame['college'].sum()

12171

In [9]:
from sklearn import linear_model

In [10]:
# One ordinary-least-squares estimator, re-fit in place by each later reg_model.fit(...) cell.
reg_model = linear_model.LinearRegression()

In [11]:
# Provisional modelling subset: income, age, sex, labour-force status and the college flag.
temp_df = CPS_data_frame[['inctot', 'age', 'sex', 'labforce', 'college']] # provisional feature set, may be extended later

In [12]:
# Discard every row that has a missing value in any of the selected columns.
temp_df = temp_df.dropna(axis=0, how='any')

In [13]:
# Preview the first 8 rows of the modelling frame.
temp_df.head(8)

Unnamed: 0,inctot,age,sex,labforce,college
7,24664.0,30,2,2,0
8,41000.0,33,1,2,0
17,10929.0,34,2,2,0
42,25000.0,33,2,2,0
52,20218.0,31,2,2,0
68,53300.0,34,1,2,1
69,52000.0,33,2,2,1
83,15500.0,31,2,2,0


In [14]:
# Count the people who are in the labour force (labforce code 2).
temp_df['labforce'].eq(2).sum()

29747

In [15]:
# 0/1 indicator: 1 when the person is in the labour force (labforce code 2).
temp_df['worker'] = temp_df['labforce'].eq(2).astype(int)

In [16]:
# 0/1 indicator: 1 when sex is coded 2 (female).
temp_df['female'] = temp_df['sex'].eq(2).astype(int)

In [17]:
# Regression target: natural log of total income, with a 0.01 offset added before the log.
temp_df['loginc'] = np.log(temp_df['inctot'].add(0.01))

In [18]:
# Baseline model: regress log income on the college indicator alone.
# Log income is used instead of raw income because:
#   - it evens out the income distribution,
#   - coefficients become easier to interpret,
#   - it mirrors how pay changes in practice (percentage raises, not fixed amounts).
reg_model.fit(temp_df[['college']], temp_df[['loginc']])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Slope on `college`, then the in-sample R^2 of the college-only model.
# The target is passed as a one-column frame, so coef_ prints as a 2-D array.
print(reg_model.coef_)
print(reg_model.score(temp_df[['college']], temp_df[['loginc']]))
# 1st number: the coefficient is a difference in log income. About 0.51 log points
#   corresponds to exp(0.51) - 1, i.e. roughly 67% higher income for college graduates.
# 2nd number: R^2 -- the model explains a little over 15% of the variance in log income.


[[ 0.51051155]]
0.152728142662


In [20]:
# Re-fit with age added as a second regressor (plain NumPy arrays in, 1-D target).
reg_model.fit(temp_df[['college', 'age']].values,
              temp_df['loginc'].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Coefficients in feature order [college, age], then the in-sample R^2.
print(reg_model.coef_)
print(reg_model.score(temp_df[['college', 'age']].values,
                      temp_df['loginc'].values))
# Reading the age slope: ~0.025 log points per year is about a 2.5% raise
# in income for each additional year of age (exp(0.025) - 1).

[ 0.50900663  0.02495914]
0.155782226037


In [22]:
# Re-fit using the college and female indicators as regressors.
reg_model.fit(temp_df[['college', 'female']].values,
              temp_df['loginc'].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# Coefficients in feature order [college, female], then the in-sample R^2.
print(reg_model.coef_)
print(reg_model.score(temp_df[['college', 'female']].values,
                      temp_df['loginc'].values))
# The negative `female` coefficient means women in this sample earn less at a given
# college status: about -0.28 log points, i.e. roughly 25% lower income.

[ 0.54218662 -0.28206222]
0.201171592374


In [24]:
# Re-fit with college, age and female together.
reg_model.fit(temp_df[['college', 'age', 'female']].values,
              temp_df['loginc'].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
# Coefficients in feature order [college, age, female], then the in-sample R^2.
print(reg_model.coef_)
print(reg_model.score(temp_df[['college', 'age', 'female']].values,
                      temp_df['loginc'].values))

[ 0.5406821   0.0249796  -0.28207681]
0.204230685441


In [26]:
# Fullest specification: college, age, female and the labour-force indicator.
reg_model.fit(temp_df[['college', 'age', 'female', 'worker']].values,
              temp_df['loginc'].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Coefficients in feature order [college, age, female, worker], then the in-sample R^2.
print(reg_model.coef_)
print(reg_model.score(temp_df[['college', 'age', 'female', 'worker']].values,
                      temp_df['loginc'].values))

[ 0.52823684  0.02425252 -0.27419702  0.3618321 ]
0.222343869635


In [28]:
# Sample size: the number of rows (people) included in the regressions.
temp_df.shape[0]

31640