In [1]:
import pandas as pd
import numpy as np
import functions as f
from sklearn import linear_model

In [109]:
# choose cols to import. goes very slowly if you try to import everything, and some cols make it error. see file
import_cols = ['racdin', 'year', 'age', 'sex', 'race', 'region', 'educ', 'relig']
# import_cols = ['racdin', 'year', 'age', 'sex']
# these seem like all of the relevant categories. Some of them don't plot nicely yet.

# make sure that in your repo, there is a symlink (named 'data') to the real 'data' folder
def data_loc(filename):
    import os
    return os.path.join(os.readlink('data'), filename)

# if you make changes to functions.py, use this before running anything:
reload(f)

# chunksize must be None or >= 57061 rows (for GSS), because of pandas flaw
gss = f.load_dta(data_loc('GSS7212_R2.DTA'), chunksize = None,
                 columns = import_cols)



Loaded 57061 rows...
Done!


In [110]:
# what the df looks like before modification
gss.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57061 entries, 0 to 57060
Data columns (total 8 columns):
racdin    12033 non-null category
year      57061 non-null int16
age       56859 non-null category
sex       57061 non-null category
race      57061 non-null category
region    57061 non-null category
educ      56897 non-null category
relig     56828 non-null category
dtypes: category(7), int16(1)
memory usage: 948.3 KB


In [111]:
# drop all rows where y variable is NA
gss = gss[gss['racdin'].notnull()]
# change age to number (89+ just coded as 89)
gss.age = gss.age.cat.codes + 18

In [112]:
# map sex to numeric
gss.sex = gss.sex.map({'female': 0, 'male': 1}).astype(int)

In [113]:
# change racdin to numeric
gss.racdin = gss.racdin.map({'strongly object': -1, 'mildly object': 0, 'not object': 1}).astype(int)

In [116]:
# fill in missing educ values with mean
gss.educ = gss.educ.cat.codes
educ_mean = gss.educ.mean()
gss.educ = gss.educ.replace(-1, educ_mean)

In [117]:
# change categorical variables to one-hot encoding
gss = pd.concat([gss, pd.get_dummies(gss['race'], prefix='race')], axis=1)
gss = pd.concat([gss, pd.get_dummies(gss['region'], prefix='region')], axis=1)
gss = pd.concat([gss, pd.get_dummies(gss['relig'], prefix='relig')], axis=1)

In [118]:
gss = gss.drop(['race', 'region', 'relig'], axis=1)

In [129]:
# add interactions with time
columns = gss.columns.tolist()
for c in columns[2:]:
    gss[c + '_X_year'] = gss[c] * gss['year']

In [130]:
gss.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12033 entries, 0 to 18585
Data columns (total 58 columns):
racdin                                  12033 non-null int64
year                                    12033 non-null int16
age                                     12033 non-null int8
sex                                     12033 non-null int64
educ                                    12033 non-null float64
race_white                              12033 non-null float64
race_black                              12033 non-null float64
race_other                              12033 non-null float64
region_new england                      12033 non-null float64
region_middle atlantic                  12033 non-null float64
region_e. nor. central                  12033 non-null float64
region_w. nor. central                  12033 non-null float64
region_south atlantic                   12033 non-null float64
region_e. sou. central                  12033 non-null float64
region_w. sou. cen

In [148]:
# randomly split out 2000 test examples
gss = gss.iloc[np.random.permutation(len(gss))]
traindata = gss.values[0:(len(gss)-2000), 0:]
testdata = gss.values[(len(gss)-2000):, 0:]
print traindata.shape
print testdata.shape

(10033, 58)
(2000, 58)


In [149]:
regmodel = linear_model.Ridge()
regmodel.fit(traindata[0:, 1:], traindata[0:, 0])

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [150]:
print regmodel.score(traindata[0:, 1:], traindata[0:, 0])
print regmodel.score(testdata[0:, 1:], testdata[0:, 0])

0.109869632717
0.124533292165


12033