In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from pipeline import *
%autoreload

from sklearn.mixture import GaussianMixture

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [10]:
path = '../data/2017 Pew Research Center Science and News Survey/Segmentation_data.csv'

In [11]:
data, segment_data = get_cluster_data(path)
gm3 = GaussianMixture(n_components = 3, covariance_type=  'spherical', random_state=20)
gm3.fit(segment_data)
labelsg3 = gm3.predict(segment_data)
data['SEGMENT'] = labelsg3

#### Drop Segment Variables

In [12]:
#data.drop(labels = segment_vars, axis= 1, inplace = True)

#### Train Test Split

In [13]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .33, random_state = 20)

In [23]:
X_train['LIST1_b']

CaseID
3846    1
3293    0
317     0
1119    0
2347    0
3218    1
1443    0
977     0
360     0
2730    0
3845    0
660     0
3282    1
304     0
1209    1
905     1
1738    0
1337    0
1796    0
866     0
2399    1
1478    0
1820    0
3262    1
2640    0
3454    0
3748    1
1184    0
535     0
2149    0
       ..
3113    0
3594    1
704     1
2193    0
2832    1
3060    1
2790    0
3459    0
2609    0
2927    1
3025    0
3580    0
1543    0
1910    0
715     0
1161    0
920     0
1746    0
2724    0
1471    0
2736    0
698     0
2235    1
811     0
2408    0
2731    0
2136    0
3755    0
390     0
3897    0
Name: LIST1_b, Length: 1841, dtype: int64

## Feature Selection

### Model 1
- Taking age out doesnt improve model

In [8]:
features1 = ['PPGENDER', 'TOPICINT_e', 'PPINCIMP', 'PPEDUCAT', 'ppagecat']

In [9]:
X_train1 = X_train[features1]

In [10]:
X_train1.head()

Unnamed: 0_level_0,PPGENDER,TOPICINT_e,PPINCIMP,PPEDUCAT,ppagecat
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3846,1,1.0,3.0,4,5
3293,1,2.0,5.0,4,3
317,2,2.0,4.0,4,3
1119,2,2.0,4.0,3,7
2347,2,2.0,3.0,2,2


In [11]:
# Gender 1: Male 0: Female
X_train1['PPGENDER'] = X_train1['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
topic_dummies = pd.get_dummies(X_train1['TOPICINT_e'], drop_first = True)

topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train1['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)

inc_dummies = pd.get_dummies(X_train1['PPINCIMP'], drop_first = True)
inc_dummies.rename(columns = {2.0: '20-40k', 3.0: '40-60k', 4.0: '60-85k', 5.0: '85-125k', 
                              6.0: '125-175k', 7.0:'175-200k', 8.0: '200k+'}, inplace = True)

edu_dummies = pd.get_dummies(X_train1['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

In [13]:
X_train1 = pd.concat((X_train1, topic_dummies, inc_dummies, age_dummies, edu_dummies), axis = 1)
X_train1.drop(labels = ['TOPICINT_e', 'PPINCIMP', 'PPEDUCAT', 'ppagecat'], axis = 1, inplace = True)

In [14]:
X_train1.head()

Unnamed: 0_level_0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,20-40k,40-60k,60-85k,85-125k,125-175k,175-200k,200k+,25-34,35-44,45-54,55-64,65-74,75+,Completed HS,Some College,Bachelors+
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3846,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3293,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
317,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1
1119,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
2347,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0


#### Logistic Regression

In [15]:
log_reg = LogisticRegression().fit(X_train1, y_train)

In [16]:
log_reg.score(X_train1, y_train)

0.6610537751222162

In [17]:
cv_lr = cross_val_score(log_reg, X_train1, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr

0.6567288043878495

#### Random Forest
- Grid Search Best Parameters: 
    - Max depth: 10
    - n_estimators: 100

In [18]:
rf = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train1, y_train)

In [19]:
cv_rf = cross_val_score(rf, X_train1, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf

0.6311970289801707

##### Grid Search

In [20]:
# rf_params ={'n_estimators': [50, 100, 200, 500], 'max_depth' : [1, 2, 10, 50]}
# gs_rf = GridSearchCV(rf, rf_params, scoring = 'accuracy', cv = 5)
# gs_rf.fit(X_train, y_train)

In [21]:
# gs_rf.best_params_
# # Best Params: {'max_depth': 10, 'n_estimators': 100}
# gs_rf.best_score_

#### Gradient Boost
- Gird Search Best Params
    - Learning Rate: .1
    - max_depth: 1
    - n_etimators: 100
    - subsample = .25

In [22]:
gb = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 100, subsample=.25).fit(X_train1, y_train)

In [23]:
gb.score(X_train1, y_train)

0.6610537751222162

In [24]:
cv_gb = cross_val_score(gb, X_train1, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb

0.6529303320024206

##### Grid Search

In [25]:
# gb_params ={'learning_rate': [.001, .01, .1], 'n_estimators' : [50, 100, 200],
#            'subsample': [.25, .5, .75, 1], 'max_depth': [1, 2, 10, 50]}
# gs_gb = GridSearchCV(gb, gb_params, scoring = 'accuracy', cv = 5)
# gs_gb.fit(X_train, y_train)

In [26]:
# gs_gb.best_params_
# # Best Params: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100, 'subsample': 0.25}
# gs_gb.best_score_

### Model 2
- Added SCIWHY_e
- Added TOPICINT_d

In [36]:
features2 = ['PPGENDER', 'TOPICINT_e', 'ppagecat', 'PPINCIMP', 'PPEDUCAT', 'SCIWHY_e', 'TOPICINT_d']

In [37]:
X_train2 = X_train[features2]
X_test2 = X_test[features2]

In [38]:
## TRAIN
# Gender 1: Male 0: Female
X_train2['PPGENDER'] = X_train2['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

topic_dummies = pd.get_dummies(X_train2['TOPICINT_e'], drop_first = True)
topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train2['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)

inc_dummies = pd.get_dummies(X_train2['PPINCIMP'], drop_first = True)
inc_dummies.rename(columns = {2.0: '20-40k', 3.0: '40-60k', 4.0: '60-85k', 5.0: '85-125k', 
                              6.0: '125-175k', 7.0:'175-200k', 8.0: '200k+'}, inplace = True)

edu_dummies = pd.get_dummies(X_train2['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

sci_why_dummies = pd.get_dummies(X_train2['SCIWHY_e'], drop_first = True)
sci_why_dummies.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

topicbus_dummies = pd.get_dummies(X_train2['TOPICINT_d'], drop_first = True)

topicbus_dummies.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
X_train2 = pd.concat((X_train2, topic_dummies, age_dummies, inc_dummies, edu_dummies, 
                     sci_why_dummies, topicbus_dummies), axis = 1)

X_train2.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPINCIMP', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e'], axis = 1, inplace = True)

#### Logistic Regression 2

In [40]:
log_reg2 = LogisticRegression().fit(X_train2, y_train)
cv_lr2 = cross_val_score(log_reg2, X_train2, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr2

0.6768185129211666

#### Random Forest 2

In [42]:
rf2 = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train2, y_train)
cv_rf2 = cross_val_score(rf2, X_train2, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf2

0.6583961167659782

#### Gradient Boost 2

In [43]:
## Boosted using Grid Search

In [44]:
gb2 = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 100, subsample=.25).fit(X_train2, y_train)
cv_gb2 = cross_val_score(gb2, X_train2, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb2

0.6741114156450017

##### GB2 Grid Search

In [46]:
# gb2_params ={'learning_rate': [.01, .1], 'n_estimators' : [50, 100, 200],
#            'subsample': [.25, .5, .75], 'max_depth': [1, 2, 10]}
# gs_gb2 = GridSearchCV(gb2, gb2_params, scoring = 'accuracy', cv = 5)
# gs_gb2.fit(X_train2, y_train)

In [47]:
#gs_gb2.best_params_

In [48]:
# Best Params {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100, 'subsample': 0.25}
#gs_gb2.best_score_

### Model 3

In [49]:
features3 = ['PPGENDER', 'TOPICINT_e_t2b', 'ppagecat_short', 'PPINCIMP', 'PPEDUCAT']

In [50]:
X_train3 = X_train[features3]

In [51]:
X_train3.head()

Unnamed: 0_level_0,PPGENDER,TOPICINT_e_t2b,ppagecat_short,PPINCIMP,PPEDUCAT
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3846,1,1.0,3,3.0,4
3293,1,1.0,2,5.0,4
317,2,1.0,2,4.0,4
1119,2,1.0,4,4.0,3
2347,2,1.0,1,3.0,2


In [52]:
# Gender 1: Male 0: Female
X_train3['PPGENDER'] = X_train3['PPGENDER'].apply(lambda x: 1 if x==1 else 0)
X_train3['TOPICINT_e_t2b'] = X_train3['TOPICINT_e_t2b'].apply(lambda x: 1 if x==1.0 else 0)


age_dummies = pd.get_dummies(X_train3['ppagecat_short'], drop_first = True) 
age_dummies.rename(columns = {2: '35-54', 3: '55-64', 4: '75+'}, inplace = True)

inc_dummies = pd.get_dummies(X_train3['PPINCIMP'], drop_first = True)
inc_dummies.rename(columns = {2.0: '20-40k', 3.0: '40-60k', 4.0: '60-85k', 5.0: '85-125k', 
                              6.0: '125-175k', 7.0:'175-200k', 8.0: '200k+'}, inplace = True)

edu_dummies = pd.get_dummies(X_train3['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
X_train3 = pd.concat((X_train3, age_dummies, inc_dummies, edu_dummies), axis = 1)

X_train3.drop(labels = ['ppagecat_short', 'PPINCIMP', 'PPEDUCAT'], axis = 1, inplace = True)

In [54]:
X_train3.head()

Unnamed: 0_level_0,PPGENDER,TOPICINT_e_t2b,35-54,55-64,75+,20-40k,40-60k,60-85k,85-125k,125-175k,175-200k,200k+,Completed HS,Some College,Bachelors+
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3846,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1
3293,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1
317,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1
1119,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0
2347,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0


#### Logistic Regression 3

In [55]:
log_reg3 = LogisticRegression().fit(X_train3, y_train)
cv_lr3 = cross_val_score(log_reg3, X_train3, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr3

0.6556476992927058

#### Random Forest 3

In [56]:
rf3 = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train3, y_train)
cv_rf3 = cross_val_score(rf3, X_train3, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf3

0.6295709647569616

#### Gradient Boost 3

In [57]:
gb3 = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 100, subsample=.25).fit(X_train3, y_train)
cv_gb3 = cross_val_score(gb3, X_train3, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb3

0.6561882398332463

### Model 4

In [58]:
features4 = ['PPGENDER', 'TOPICINT_e', 'PPINCIMP', 'PPEDUCAT', 'ppagecat', 'SOURCE1']
X_train4 = X_train[features4]

In [59]:
# Gender 1: Male 0: Female
X_train4['PPGENDER'] = X_train4['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

topic_dummies = pd.get_dummies(X_train4['TOPICINT_e'], drop_first = True)

topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train4['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)

inc_dummies = pd.get_dummies(X_train4['PPINCIMP'], drop_first = True)
inc_dummies.rename(columns = {2.0: '20-40k', 3.0: '40-60k', 4.0: '60-85k', 5.0: '85-125k', 
                              6.0: '125-175k', 7.0:'175-200k', 8.0: '200k+'}, inplace = True)

edu_dummies = pd.get_dummies(X_train4['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

X_train4['SOURCE1'] = X_train4['SOURCE1'].apply(lambda x: 1 if x==1.0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [60]:
X_train4 = pd.concat((X_train4, topic_dummies, inc_dummies, age_dummies, edu_dummies), axis = 1)
X_train4.drop(labels = ['TOPICINT_e', 'PPINCIMP', 'PPEDUCAT', 'ppagecat'], axis = 1, inplace = True)

#### Logistic Regression 4

In [61]:
log_reg4 = LogisticRegression().fit(X_train4, y_train)
cv_lr4 = cross_val_score(log_reg4, X_train4, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr4

0.6539937867615275

#### Random Forest 4

In [62]:
rf4 = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train4, y_train)
cv_rf4 = cross_val_score(rf4, X_train4, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf4

0.6360691781107016

#### Gradient Boost 4

In [63]:
gb4 = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 100, subsample=.25).fit(X_train4, y_train)
cv_gb4 = cross_val_score(gb4, X_train4, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb4

0.657818746658043

- Can use ordinal variables without making dummies as long as the scale is the same (look into this)
- Build a model just using demos, etc.
- Compare to the .predict method from Gaussian Mixture Model, have both to see how (ad targeting) would compare

### Model 5
- Remove Income

In [171]:
features5 = ['PPGENDER', 'TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'SCIWHY_e', 'TOPICINT_d']
X_train5 = X_train[features5]
X_test5 = X_test[features5]

In [172]:
X_train5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1841 entries, 3846 to 3897
Data columns (total 6 columns):
PPGENDER      1841 non-null int64
TOPICINT_e    1841 non-null float64
ppagecat      1841 non-null int64
PPEDUCAT      1841 non-null int64
SCIWHY_e      1841 non-null float64
TOPICINT_d    1841 non-null float64
dtypes: float64(3), int64(3)
memory usage: 180.7 KB


In [173]:
X_train5.head(10)

Unnamed: 0_level_0,PPGENDER,TOPICINT_e,ppagecat,PPEDUCAT,SCIWHY_e,TOPICINT_d
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3846,1,1.0,5,4,1.0,2.0
3293,1,2.0,3,4,1.0,1.0
317,2,2.0,3,4,1.0,4.0
1119,2,2.0,7,3,2.0,1.0
2347,2,2.0,2,2,2.0,3.0
3218,2,2.0,5,4,1.0,3.0
1443,1,2.0,4,1,2.0,3.0
977,1,2.0,6,4,2.0,2.0
360,1,2.0,4,2,2.0,3.0
2730,1,4.0,5,1,3.0,4.0


In [176]:
# Gender 1: Male 0: Female
X_train5['PPGENDER'] = X_train5['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

topic_dummies = pd.get_dummies(X_train5['TOPICINT_e'], drop_first = True)
topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train5['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)


edu_dummies = pd.get_dummies(X_train5['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

sci_why_dummies = pd.get_dummies(X_train5['SCIWHY_e'], drop_first = True)
sci_why_dummies.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

topicbus_dummies = pd.get_dummies(X_train5['TOPICINT_d'], drop_first = True)

topicbus_dummies.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [177]:
X_train5 = pd.concat((X_train5, topic_dummies, age_dummies, edu_dummies, 
                     sci_why_dummies, topicbus_dummies), axis = 1)

X_train5.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e'], axis = 1, inplace = True)

In [178]:
X_train5.head(5)

Unnamed: 0_level_0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,65-74,75+,Completed HS,Some College,Bachelors+,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3846,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
3293,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
317,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
1119,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0
2347,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0


In [179]:
X_train5.shape

(1841, 18)

#### Logistic Regression 5

In [180]:
log_reg5 = LogisticRegression().fit(X_train5, y_train)
cv_lr5 = cross_val_score(log_reg5, X_train5, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr5

0.6735620619434741

In [181]:
#log_reg5.score(X_test5, y_test)

#### Random Forest 5

In [182]:
rf5 = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train5, y_train)
cv_rf5 = cross_val_score(rf5, X_train5, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf5

0.6469638700422967

#### Gradient Boost 5

In [183]:
gb5 = GradientBoostingClassifier(learning_rate=.1, max_depth = 2, n_estimators = 50, subsample=.5).fit(X_train5, y_train)
cv_gb5 = cross_val_score(gb5, X_train5, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb5

0.673572355971657

In [184]:
# gb5.score(X_test5, y_test)

### Model 5 Optimization

In [24]:
features5a = ['PPGENDER', 'TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'SCIWHY_e', 'TOPICINT_d', 'LIST1_b', 'TOPICINT_a',
             'SOURCE1']
X_train5a = X_train[features5a]
X_test5a = X_test[features5a]

In [26]:
X_train5a.head()

Unnamed: 0_level_0,PPGENDER,TOPICINT_e,ppagecat,PPEDUCAT,SCIWHY_e,TOPICINT_d,LIST1_b,TOPICINT_a,SOURCE1
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3846,1,1.0,5,4,1.0,2.0,1,1.0,2.0
3293,1,2.0,3,4,1.0,1.0,0,1.0,1.0
317,2,2.0,3,4,1.0,4.0,0,3.0,1.0
1119,2,2.0,7,3,2.0,1.0,0,1.0,2.0
2347,2,2.0,2,2,2.0,3.0,0,2.0,1.0


In [27]:
# Gender 1: Male 0: Female
X_train5a['PPGENDER'] = X_train5a['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

# Source 1: Specific sources 0: Many sources
X_train5a['SOURCE1'] = X_train5a['SOURCE1'].apply(lambda x: 1 if x==1.0 else 0)

topic_dummies = pd.get_dummies(X_train5a['TOPICINT_e'], drop_first = True)
topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train5a['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)


edu_dummies = pd.get_dummies(X_train5a['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

sci_why_dummies = pd.get_dummies(X_train5a['SCIWHY_e'], drop_first = True)
sci_why_dummies.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

topicbus_dummies = pd.get_dummies(X_train5a['TOPICINT_d'], drop_first = True)
topicbus_dummies.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

topicgov_dummies = pd.get_dummies(X_train5a['TOPICINT_a'], drop_first = True)
topicgov_dummies.rename(columns = {2.0: 'GovNews: Somewhat Interested',
                               3.0: 'GovNews: Not Too Interested',
                               4.0: 'GovNews: Not At All Interested'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [29]:
X_train5a = pd.concat((X_train5a, topic_dummies, age_dummies, edu_dummies, 
                     sci_why_dummies, topicbus_dummies, topicgov_dummies), axis = 1)

X_train5a.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e', 'TOPICINT_a'], axis = 1, inplace = True)

In [32]:
X_train5a.shape

(1841, 23)

In [46]:
X_train5a.head()

Unnamed: 0_level_0,PPGENDER,LIST1_b,SOURCE1,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,...,Some College,Bachelors+,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested,GovNews: Somewhat Interested,GovNews: Not Too Interested,GovNews: Not At All Interested
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3846,1,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
3293,1,0,1,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
317,0,0,1,1,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,1,0
1119,0,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2347,0,0,1,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0


#### Logistic Regression 5a

In [34]:
log_reg5a = LogisticRegression().fit(X_train5a, y_train)
cv_lr5a = cross_val_score(log_reg5a, X_train5a, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr5a

0.6713925194593953

#### Random Forest 5a

In [35]:
rf5a = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train5a, y_train)
cv_rf5a = cross_val_score(rf5a, X_train5a, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf5a

0.6626968432714196

#### Gradient Boost 5a
- Optimized using Grid Search

In [44]:
gb5a = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 200, subsample=.75).fit(X_train5a, y_train)
cv_gb5a = cross_val_score(gb5a, X_train5a, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb5a

0.6708269722749641

##### Grid Search

In [40]:
# gb5a_params ={'learning_rate': [.001, .01, .1], 'n_estimators' : [100, 200, 500],
#            'subsample': [.25, .5, .75], 'max_depth': [1, 2, 5, 10]}
# gs_gb5a = GridSearchCV(gb5a, gb5a_params, scoring = 'accuracy', cv = 5)
# gs_gb5a.fit(X_train5a, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.001, 0.01, 0.1], 'n_estimators': [100, 200, 500], 'subsample': [0.25, 0.5, 0.75], 'max_depth': [1, 2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [43]:
# gs_gb5a.best_params_
## BEST PARAMETERS: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.75}
# gs_gb5a.best_score_
## BEST SCORE: 0.6735469853340575

##### Test Score

In [37]:
# Gender 1: Male 0: Female
X_test5a['PPGENDER'] = X_test5a['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

# Source 1: Specific sources 0: Many sources
X_test5a['SOURCE1'] = X_test5a['SOURCE1'].apply(lambda x: 1 if x==1.0 else 0)

topic_dummies_test = pd.get_dummies(X_test5a['TOPICINT_e'], drop_first = True)
topic_dummies_test.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies_test = pd.get_dummies(X_test5a['ppagecat'], drop_first = True) 
age_dummies_test.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)


edu_dummies_test = pd.get_dummies(X_test5a['PPEDUCAT'], drop_first = True)
edu_dummies_test.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

sci_why_dummies_test = pd.get_dummies(X_test5a['SCIWHY_e'], drop_first = True)
sci_why_dummies_test.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

topicbus_dummies_test = pd.get_dummies(X_test5a['TOPICINT_d'], drop_first = True)
topicbus_dummies_test.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

topicgov_dummies_test = pd.get_dummies(X_test5a['TOPICINT_a'], drop_first = True)
topicgov_dummies_test.rename(columns = {2.0: 'GovNews: Somewhat Interested',
                               3.0: 'GovNews: Not Too Interested',
                               4.0: 'GovNews: Not At All Interested'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [38]:
X_test5a = pd.concat((X_test5a, topic_dummies_test, age_dummies_test, edu_dummies_test, 
                     sci_why_dummies_test, topicbus_dummies_test, topicgov_dummies_test), axis = 1)

X_test5a.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e', 'TOPICINT_a'], axis = 1, inplace = True)

In [45]:
gb5a.score(X_test5a, y_test)

0.6850220264317181

## MAKE PREDICTIONS

In [185]:
X_train5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1841 entries, 3846 to 3897
Data columns (total 18 columns):
PPGENDER                                                  1841 non-null int64
SciNews: Somewhat Interested                              1841 non-null uint8
SciNews: Not Too Interested                               1841 non-null uint8
SciNews: Not At All Interested                            1841 non-null uint8
25-34                                                     1841 non-null uint8
35-44                                                     1841 non-null uint8
45-54                                                     1841 non-null uint8
55-64                                                     1841 non-null uint8
65-74                                                     1841 non-null uint8
75+                                                       1841 non-null uint8
Completed HS                                              1841 non-null uint8
Some College                    

In [186]:
X_train5.head()

Unnamed: 0_level_0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,65-74,75+,Completed HS,Some College,Bachelors+,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3846,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
3293,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
317,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
1119,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0
2347,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0


In [198]:
gender = 1
age = 3
education = 4
busnews = 4.0
scinews = 4.0
sciwhy = 3.0

In [199]:
d = {'PPGENDER': [gender], 'ppagecat': [age],
     'PPEDUCAT': [education], 'TOPICINT_d': [busnews],
     'TOPICINT_e': [scinews], 'SCIWHY_e': [sciwhy]}

In [200]:
X_new = pd.DataFrame(data = d)

In [201]:
X_new['PPGENDER'] = gender
X_new['ppagecat'] = age
X_new['PPEDUCAT'] = education
X_new['TOPICINT_d'] = busnews
X_new['TOPICINT_e'] = scinews
X_new['SCIWHY_e'] = sciwhy

In [203]:
X_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
PPGENDER      1 non-null int64
ppagecat      1 non-null int64
PPEDUCAT      1 non-null int64
TOPICINT_d    1 non-null float64
TOPICINT_e    1 non-null float64
SCIWHY_e      1 non-null float64
dtypes: float64(3), int64(3)
memory usage: 128.0 bytes


In [196]:
def transform_new(X_new):
    # Transforms gender column
    X_new['PPGENDER'] = X_new['PPGENDER'].apply(lambda x: 1 if x==1 else 0)
    
    # Transforms age column
    if X_new['ppagecat'][0] == 1:
        d = {'18-24': [1], '25-34': [0], '35-44': [0], '45-54': [0],
             '55-64': [0], '65-74': [0], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
        
    elif X_new['ppagecat'][0] == 2:
        d = {'18-24': [0], '25-34': [1], '35-44': [0], '45-54': [0],
             '55-64': [0], '65-74': [0], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
        
    elif X_new['ppagecat'][0] == 3:
        d = {'18-24': [0], '25-34': [0], '35-44': [1], '45-54': [0],
             '55-64': [0], '65-74': [0], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
        
        
    elif X_new['ppagecat'][0] == 4:
        d = {'18-24': [0], '25-34': [0], '35-44': [0], '45-54': [1],
             '55-64': [0], '65-74': [0], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
        
        
    elif X_new['ppagecat'][0] == 5:
        d = {'18-24': [0], '25-34': [0], '35-44': [0], '45-54': [0],
             '55-64': [1], '65-74': [0], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
        
        
    elif X_new['ppagecat'][0] == 6:
        d = {'18-24': [0], '25-34': [0], '35-44': [0], '45-54': [0],
             '55-64': [0], '65-74': [1], '75+': [0]}
        age_dummies = pd.DataFrame(data = d)
       
    
    elif X_new['ppagecat'][0] == 7:
        d = {'18-24': [0], '25-34': [0], '35-44': [0], '45-54': [0],
             '55-64': [0], '65-74': [0], '75+': [1]}
        age_dummies = pd.DataFrame(data = d)
        
    
    # Transforms education column
    if X_new['PPEDUCAT'][0] == 1:
        d = {'Less than HS': [1], 'Completed HS': [0], 'Some College': [0], 
             'Bachelors+': [0]}
        edu_dummies = pd.DataFrame(data = d)
        
    elif X_new['PPEDUCAT'][0] == 2:
        d = {'Less than HS': [0], 'Completed HS': [1], 'Some College': [0], 
             'Bachelors+': [0]}
        edu_dummies = pd.DataFrame(data = d)
        
    elif X_new['PPEDUCAT'][0] == 3:
        d = {'Less than HS': [0], 'Completed HS': [0], 'Some College': [1], 
             'Bachelors+': [0]}
        edu_dummies = pd.DataFrame(data = d)
        
    elif X_new['PPEDUCAT'][0] == 4:
        d = {'Less than HS': [0], 'Completed HS': [0], 'Some College': [0], 
             'Bachelors+': [1]}
        edu_dummies = pd.DataFrame(data = d)
        
    # Transforms Topic Int d column
    if X_new['TOPICINT_d'][0] == 1.0:
        d = {'BusNews: Very Interested': [1], 
             'BusNews: Somewhat Interested': [0], 
             'BusNews: Not Too Interested': [0], 
             'BusNews: Not At All Interested': [0]}
        busnews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_d'][0] == 2.0:
        d = {'BusNews: Very Interested': [0], 
             'BusNews: Somewhat Interested': [1], 
             'BusNews: Not Too Interested': [0], 
             'BusNews: Not At All Interested': [0]}
        busnews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_d'][0] == 3.0:
        d = {'BusNews: Very Interested': [0], 
             'BusNews: Somewhat Interested': [0], 
             'BusNews: Not Too Interested': [1], 
             'BusNews: Not At All Interested': [0]}
        busnews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_d'][0] == 4.0:
        d = {'BusNews: Very Interested': [0], 
             'BusNews: Somewhat Interested': [0], 
             'BusNews: Not Too Interested': [0], 
             'BusNews: Not At All Interested': [1]}
        busnews_dummies = pd.DataFrame(data = d)
        
    # Transforms Topic Int e column
    if X_new['TOPICINT_e'][0] == 1.0:
        d = {'SciNews: Very Interested': [1], 
             'SciNews: Somewhat Interested': [0], 
             'SciNews: Not Too Interested': [0], 
             'SciNews: Not At All Interested': [0]}
        scinews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_e'][0] == 2.0:
        d = {'SciNews: Very Interested': [0], 
             'SciNews: Somewhat Interested': [1], 
             'SciNews: Not Too Interested': [0], 
             'SciNews: Not At All Interested': [0]}
        scinews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_e'][0] == 3.0:
        d = {'SciNews: Very Interested': [0], 
             'SciNews: Somewhat Interested': [0], 
             'SciNews: Not Too Interested': [1], 
             'SciNews: Not At All Interested': [0]}
        scinews_dummies = pd.DataFrame(data = d)
        
    elif X_new['TOPICINT_e'][0] == 4.0:
        d = {'SciNews: Very Interested': [0], 
             'SciNews: Somewhat Interested': [0], 
             'SciNews: Not Too Interested': [0], 
             'SciNews: Not At All Interested': [1]}
        scinews_dummies = pd.DataFrame(data = d)
        
    # Transforms Sci Why column
    if X_new['SCIWHY_e'][0] == 1.0:
        d = {'Curious about what\'s happening in science major reason': [1], 
             'Curious about what\'s happening in science minor reason': [0], 
             'Curious about what\'s happening in science not a reason': [0]}
        sciwhy_dummies = pd.DataFrame(data = d)
        
    elif X_new['SCIWHY_e'][0] == 2.0:
        d = {'Curious about what\'s happening in science major reason': [0], 
             'Curious about what\'s happening in science minor reason': [1], 
             'Curious about what\'s happening in science not a reason': [0]}
        sciwhy_dummies = pd.DataFrame(data = d)
        
    elif X_new['SCIWHY_e'][0] == 3.0:
        d = {'Curious about what\'s happening in science major reason': [0], 
             'Curious about what\'s happening in science minor reason': [0], 
             'Curious about what\'s happening in science not a reason': [1]}
        sciwhy_dummies = pd.DataFrame(data = d)
        
        
    age_dummies.drop(columns = '18-24', axis = 1, inplace = True)
    edu_dummies.drop(columns = 'Less than HS', axis = 1, inplace = True)
    busnews_dummies.drop(columns = 'BusNews: Very Interested', axis = 1, inplace = True)
    scinews_dummies.drop(columns = 'SciNews: Very Interested', axis = 1, inplace = True)
    sciwhy_dummies.drop(columns = 'Curious about what\'s happening in science major reason', axis = 1, inplace = True)
        
    
    X_new = pd.concat((X_new, scinews_dummies, age_dummies, edu_dummies, 
                       sciwhy_dummies, busnews_dummies), axis = 1)

    X_new.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e'], axis = 1, inplace = True)
    
    return X_new
    
    
    
    

In [197]:
transform_new(X_new)

Unnamed: 0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,65-74,75+,Completed HS,Some College,Bachelors+,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested
0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1


In [195]:
X_train5

Unnamed: 0_level_0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,65-74,75+,Completed HS,Some College,Bachelors+,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3846,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
3293,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
317,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
1119,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0
2347,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0
3218,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
1443,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
977,1,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0
360,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
2730,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1


In [None]:
X_train5 = pd.concat((X_train5, topic_dummies, age_dummies, edu_dummies, 
                     sci_why_dummies, topicbus_dummies), axis = 1)

X_train5.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e'], axis = 1, inplace = True)

In [378]:
# Gender 1: Male 0: Female
X_new['PPGENDER'] = X_new['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

topic_dummies = pd.get_dummies(X_new['TOPICINT_e'], drop_first = True)
topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_new['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)


edu_dummies = pd.get_dummies(X_new['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)

sci_why_dummies = pd.get_dummies(X_new['SCIWHY_e'], drop_first = True)
sci_why_dummies.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

topicbus_dummies = pd.get_dummies(X_new['TOPICINT_d'], drop_first = True)

topicbus_dummies.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

In [102]:
age_dummies

Unnamed: 0_level_0,25-34,35-44,45-54,55-64,65-74,75+
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3846,0,0,0,1,0,0
3293,0,1,0,0,0,0
317,0,1,0,0,0,0
1119,0,0,0,0,0,1
2347,1,0,0,0,0,0
3218,0,0,0,1,0,0
1443,0,0,1,0,0,0
977,0,0,0,0,1,0
360,0,0,1,0,0,0
2730,0,0,0,1,0,0


In [379]:
X_newT = pd.concat((X_new, topic_dummies, age_dummies, edu_dummies, 
                     sci_why_dummies, topicbus_dummies), axis = 1)

X_newT.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d',
                       'SCIWHY_e'], axis = 1, inplace = True)

In [380]:
gb5.predict(X_newT)

ValueError: Number of features of the model must match the input. Model n_features is 18 and input n_features is 1 

##### Grid Search GB5

In [264]:
gb5_params ={'learning_rate': [.01, .1], 'n_estimators' : [50, 100, 200],
           'subsample': [.25, .5, .75], 'max_depth': [1, 2, 10]}
gs_gb5 = GridSearchCV(gb5, gb5_params, scoring = 'accuracy', cv = 5)
gs_gb5.fit(X_train5, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=0.25, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100, 200], 'subsample': [0.25, 0.5, 0.75], 'max_depth': [1, 2, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [265]:
# Best Params{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50, 'subsample': 0.5}
gs_gb5.best_score_

0.6751765344921239

In [269]:
gs_gb5.best_params_

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50, 'subsample': 0.5}

### Model 6
- Add TopicIntA

In [15]:
features6 = ['PPGENDER', 'TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d', 'TOPICINT_a', 'SCIWHY_e']
X_train6 = X_train[features6]

In [16]:
# Gender 1: Male 0: Female
X_train6['PPGENDER'] = X_train6['PPGENDER'].apply(lambda x: 1 if x==1 else 0)

topic_dummies = pd.get_dummies(X_train6['TOPICINT_e'], drop_first = True)
topic_dummies.rename(columns = {2.0: 'SciNews: Somewhat Interested',
                               3.0: 'SciNews: Not Too Interested',
                               4.0: 'SciNews: Not At All Interested'}, inplace = True)

age_dummies = pd.get_dummies(X_train6['ppagecat'], drop_first = True) 
age_dummies.rename(columns = {2: '25-34', 3: '35-44', 4: '45-54',
                              5: '55-64', 6: '65-74', 7: '75+'}, inplace = True)


edu_dummies = pd.get_dummies(X_train6['PPEDUCAT'], drop_first = True)
edu_dummies.rename(columns = {2: 'Completed HS', 3: 'Some College', 4: 'Bachelors+'}, inplace = True)


topicbus_dummies = pd.get_dummies(X_train6['TOPICINT_d'], drop_first = True)

topicbus_dummies.rename(columns = {2.0: 'BusNews: Somewhat Interested',
                               3.0: 'BusNews: Not Too Interested',
                               4.0: 'BusNews: Not At All Interested'}, inplace = True)

topicgov_dummies = pd.get_dummies(X_train6['TOPICINT_a'], drop_first = True)

topicgov_dummies.rename(columns = {2.0: 'GovNews: Somewhat Interested',
                               3.0: 'GovNews: Not Too Interested',
                               4.0: 'GovNews: Not At All Interested'}, inplace = True)

sci_why_dummies = pd.get_dummies(X_train6['SCIWHY_e'], drop_first = True)
sci_why_dummies.rename(columns = {2.0: 'Curious about what\'s happening in science minor reason',
                                 3.0: 'Curious about what\'s happening in science not a reason'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
X_train6 = pd.concat((X_train6, topic_dummies, age_dummies, edu_dummies, 
                     topicbus_dummies, sci_why_dummies, topicgov_dummies), axis = 1)

X_train6.drop(labels = ['TOPICINT_e', 'ppagecat', 'PPEDUCAT', 'TOPICINT_d', 'TOPICINT_a', 'SCIWHY_e'], 
              axis = 1, inplace = True)

In [18]:
X_train6.shape

(1841, 21)

In [22]:
X_train6

Unnamed: 0_level_0,PPGENDER,SciNews: Somewhat Interested,SciNews: Not Too Interested,SciNews: Not At All Interested,25-34,35-44,45-54,55-64,65-74,75+,...,Some College,Bachelors+,BusNews: Somewhat Interested,BusNews: Not Too Interested,BusNews: Not At All Interested,Curious about what's happening in science minor reason,Curious about what's happening in science not a reason,GovNews: Somewhat Interested,GovNews: Not Too Interested,GovNews: Not At All Interested
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3846,1,0,0,0,0,0,0,1,0,0,...,0,1,1,0,0,0,0,0,0,0
3293,1,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
317,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
1119,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
2347,0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
3218,0,1,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
1443,1,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
977,1,1,0,0,0,0,0,0,1,0,...,0,1,1,0,0,1,0,1,0,0
360,1,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,1,0,0
2730,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,1,1,0,0


#### Logistic Regression 6

In [19]:
log_reg6 = LogisticRegression().fit(X_train6, y_train)
cv_lr6 = cross_val_score(log_reg6, X_train6, y_train, scoring = 'accuracy', cv = 5).mean()
cv_lr6

0.6730244591232626

#### Random Forest 6

In [20]:
rf6 = RandomForestClassifier(n_estimators = 100, max_depth=10).fit(X_train6, y_train)
cv_rf6 = cross_val_score(rf6, X_train6, y_train, scoring = 'accuracy', cv = 5).mean()
cv_rf6

0.6545813788554578

#### Gradient Boost 6

In [21]:
gb6 = GradientBoostingClassifier(learning_rate=.1, max_depth = 1, n_estimators = 100, subsample=.25).fit(X_train6, y_train)
cv_gb6 = cross_val_score(gb6, X_train6, y_train, scoring = 'accuracy', cv = 5).mean()
cv_gb6

0.6681331067473113