In [1]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.metrics import accuracy_score

In [2]:
columns_name = ['age','workclass','fnlwgt','education','education-num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week','native-country']
df = pd.read_csv('train-features.csv',names=columns_name,header=None,na_values=' ?')
df['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)

## Classification

In [6]:
class Classification:
    def __init__(self):
        
        self.model = None
        self.columns_name = ['age','workclass','fnlwgt','education','education-num',
                                       'marital-status','occupation','relationship','race','sex',
                                       'capital-gain','capital-loss','hours-per-week','native-country']
        self.train_data = pd.read_csv('train-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.train_data['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)
        self.train_data = self.train_data.dropna()
        self.test_data = pd.read_csv('test-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.test_pred = None
        
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(self.train_data[self.columns_name],self.train_data['income'],test_size=0.2,random_state=2)
    def preprocess(self,df):
        '''
        Cleans df and performs feature engineering.
        '''
        # replace nan values with random values of it's column's value
        for i in self.columns_name:
            df[i] = df[i].fillna(np.random.choice(df[i].dropna().unique()))

        # category
        df['education'] = df['education'].str.replace('Preschool', 'dropout')
        df['education'] = df['education'].str.replace('10th', 'dropout')
        df['education'] = df['education'].str.replace('11th', 'dropout')
        df['education'] = df['education'].str.replace('12th', 'dropout')
        df['education'] = df['education'].str.replace('1st-4th', 'dropout')
        df['education'] = df['education'].str.replace('5th-6th', 'dropout')
        df['education'] = df['education'].str.replace('7th-8th', 'dropout')
        df['education'] = df['education'].str.replace('9th', 'dropout')
        df['education'] = df['education'].str.replace('HS-Grad', 'HighGrad')
        df['education'] = df['education'].str.replace('HS-grad', 'HighGrad')
        df['education'] = df['education'].str.replace('Some-college', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-acdm', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-voc', 'CommunityCollege')
        
        # Binning
        
        df.loc[df['education-num']  < 9,'education-num']   = 0 # dropout
        df.loc[df['education-num'] == 9,'education-num']  = 1 # high school
        df.loc[df['education-num'] == 10,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 11,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 12,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 13,'education-num'] = 3 # Bachelor
        df.loc[df['education-num'] == 14,'education-num'] = 4 # Master
        df.loc[df['education-num'] == 15,'education-num'] = 5 # Prof-school
        df.loc[df['education-num'] == 16,'education-num'] = 6 # Doctorate
        
        # binary
        df.loc[df['capital-gain'] >= 2000,'capital-gain'] = 1
        df.loc[df['capital-gain']  < 2000,'capital-gain'] = 0
        
        df.loc[df['capital-loss']  <= 600,'capital-loss'] = 1
        df.loc[df['capital-loss']  > 600, 'capital-loss'] = 0
        
        return df
    def model(self,model):
        '''
        Takes in model.
        '''
        self.model = model
    def plain_training(self,test_df):
        '''
        It trains model once and returns accuracy score
        '''
        clf = self.model()
        clf.fit(self.x_train,self.y_train)
        y_pred = clf.predict(test_df[self.columns_name])
        
        return clf.score(self.x_test,self.y_test)
    def generate_submission(self, y_pred):
        '''
        Saves submission in the right format.
        '''
        pred_df = pd.DataFrame()
        pred_df['Id'] = np.arange(0,len(test_df[self.columns_name]))
        pred_df['Category'] = y_pred
        pred_df.to_csv('submission.csv',index=False)
        
    def encode_df(self,df):
        '''
        Encode dataframe into numbers.
        '''
        return df.apply(LabelEncoder().fit_transform)

    def randomforest_cw(self):
        
        '''
        Cross-validation training for Random Forest method.
        '''
        
                        

In [7]:
clsf = Classification()
df = clsf.train_data
df = clsf.preprocess(df)

In [11]:
test_df = clsf.test_data
test_df = clsf.preprocess(test_df)

In [35]:
encoded_df = clsf.encode_df(df)
encoded_test_df = clsf.encode_df(test_df)
# encoded_df

In [36]:
# Random forest model
randomforest = ensemble.RandomForestClassifier(max_depth=3, random_state=0)
randomforest.fit(x_train,y_train)
y_pred = randomforest.predict(encoded_test_df[test_column])
pred_df = pd.DataFrame()
pred_df['Id'] = np.arange(0,len(encoded_test_df[test_column]))
pred_df['Category'] = y_pred
pred_df.to_csv('submission.csv',index=False)
randomforest.score(x_test,y_test)




0.7566716393170894

In [37]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(x_train,y_train)
y_pred = clf.predict(encoded_test_df[test_column])
pred_df = pd.DataFrame()
pred_df['Id'] = np.arange(0,len(encoded_test_df[test_column]))
pred_df['Category'] = y_pred
pred_df.to_csv('submission.csv',',')
clf.score(x_test,y_test)

0.7543510691198408

In [26]:
reg = linear_model.Lasso()
reg.fit(x_train,y_train)
y_pred = reg.predict(encoded_test_df[test_column])
# print(y_test)
# y_pred = model_selection.cross_val_predict(reg,x_train,cv=3)
# print(y_pred.shape,y_train.shape)
# accuracy_score(y_test,y_pred)
print(y_pred)

[0.23929348 0.24142571 0.23850362 ... 0.23836012 0.24147718 0.2400234 ]


In [28]:
# Naive Bayes
NB = naive_bayes.GaussianNB()
NB.fit(x_train,y_train)
y_pred = NB.predict(x_test)
accuracy_score(y_test,y_pred)

0.7558428642466435