# Kaggle Submission

The following is my implementation of a pipeline. It is essentially me walking through the pipeline implementation by Pablo Acuna in this [kernel](https://www.kaggle.com/pacuna/pipeline-example)

### Import yo stuff

In [25]:
import pandas as pd
import numpy as np

In [26]:
test = pd.read_csv("test_census.csv")

In [27]:
data = pd.read_csv("census.csv")

In [28]:
data.isna().any()

age                False
workclass          False
education_level    False
education-num      False
marital-status     False
occupation         False
relationship       False
race               False
sex                False
capital-gain       False
capital-loss       False
hours-per-week     False
native-country     False
income             False
dtype: bool

In [29]:
test.isna().any()

Unnamed: 0         False
age                 True
workclass           True
education_level     True
education-num       True
marital-status      True
occupation          True
relationship        True
race                True
sex                 True
capital-gain        True
capital-loss        True
hours-per-week      True
native-country      True
dtype: bool

In [30]:
test = test.drop('Unnamed: 0', axis = 1)
test.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,21.0,Private,10th,6.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States
1,49.0,Private,Bachelors,13.0,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,40.0,United-States
2,44.0,Self-emp-not-inc,Assoc-acdm,12.0,Married-civ-spouse,Other-service,Wife,White,Female,0.0,0.0,99.0,United-States
3,34.0,Private,Bachelors,13.0,Married-civ-spouse,Sales,Husband,White,Male,7298.0,0.0,46.0,United-States
4,24.0,Private,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States


In [177]:
test['age'].mean()

38.54998340891494

In [178]:
test['age'].fillna(38)
test['age'].isna().values.any()

True

In [179]:
inds = pd.isnull(test).any(1).nonzero()

In [180]:
test['age'].fillna(value=38, inplace = True)

In [181]:
test.isna().any()

age                False
workclass           True
education_level     True
education-num       True
marital-status      True
occupation          True
relationship        True
race                True
sex                 True
capital-gain        True
capital-loss        True
hours-per-week      True
native-country      True
dtype: bool

In [182]:
test['workclass'].mode()

0     Private
dtype: object

In [183]:
test['workclass'].fillna(value="Private", inplace = True)

## Dealing with NaN

Filling in the nan values with the most common column occurence/mean for the age.

In [31]:
test.dropna(inplace=True)
test.isna().any()

age                False
workclass          False
education_level    False
education-num      False
marital-status     False
occupation         False
relationship       False
race               False
sex                False
capital-gain       False
capital-loss       False
hours-per-week     False
native-country     False
dtype: bool

In [184]:
for x in test.columns:
    if test[x].isna().any():
        common = test[x].mode()[0]
        print (x, test[x].isna().any(), common)
        test[x].fillna(value=common, inplace = True)

education_level True  HS-grad
education-num True 9.0
marital-status True  Married-civ-spouse
occupation True  Craft-repair
relationship True  Husband
race True  White
sex True  Male
capital-gain True 0.0
capital-loss True 0.0
hours-per-week True 40.0
native-country True  United-States


In [32]:
test.isna().any()

age                False
workclass          False
education_level    False
education-num      False
marital-status     False
occupation         False
relationship       False
race               False
sex                False
capital-gain       False
capital-loss       False
hours-per-week     False
native-country     False
dtype: bool

### Splitting features from output

In [33]:
income_raw = data['income']
features_raw = data.drop('income', axis=1)

### Separate columns according to transformations to apply

In [34]:
# numerical
num_cols = ['age', 'education-num', 'capital-gain',
            'capital-loss', 'hours-per-week']
numerical = ['age', 'education-num', 'capital-gain',
             'capital-loss', 'hours-per-week']

# categorical
cat_cols = ['workclass', 'education_level', 
            'marital-status', 'occupation', 
            'relationship', 'race', 
            'sex', 'native-country']

# need log transform
log_transform_cols = ['capital-loss', 'capital-gain']

### Apply log transforms

In [35]:
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data = features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

In [36]:
test_log_transformed = pd.DataFrame(data=test)
test_log_transformed[log_transform_cols] = test[log_transform_cols].apply(lambda x: np.log(x + 1))

## Normalizing Numerical Features

In [37]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])


In [38]:
test_log_minmax_transform = pd.DataFrame(data = test_log_transformed)
test_log_minmax_transform[numerical] = scaler.fit_transform(test_log_transformed[numerical])


## Data preprocessing

One Hot encoding.

In [39]:
features_final = pd.get_dummies(features_log_minmax_transform)
income = income_raw.apply(lambda n: 1 if n == '>50K' else 0)
test_final = pd.get_dummies(test_log_minmax_transform)

Shuffle and split data!

In [40]:
from sklearn.cross_validation import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_final, 
                                                    income, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

In [41]:
print("Training set has {} samples.".format(X_train.shape))
print("Testing set has {} samples.".format(X_test.shape))

Training set has (36177, 103) samples.
Testing set has (9045, 103) samples.


## Implementation

Import accuracy score, and ADA Boost model. The parameters have been tuned beforehand using gridsearchcv.

In [42]:
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=1000, learning_rate=1)
clf = clf.fit(X_train, y_train)

testpredictions = clf.predict(X_test)
score = accuracy_score(testpredictions, y_test)
print(score)

0.867551133222775


In [19]:
test_final = test_final.drop('workclass_Private', axis=1)

KeyError: "labels ['workclass_Private'] not contained in axis"

In [43]:
final_predictions = clf.predict(test_final)

In [44]:
submission = pd.DataFrame(data = final_predictions)

In [45]:
submission.to_csv("submit.csv", index = True)

In [46]:
d = pd.read_csv("submit.csv")
d.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0


In [47]:
est = pd.read_csv("test_census.csv")

In [51]:
est['id'] = est.iloc[:,0] 

In [49]:
est['income'] = final_predictions

ValueError: Length of values does not match length of index

In [50]:
est[['id', 'income']].to_csv("submission.csv", index=False)

KeyError: "['income'] not in index"