# Respondent Classification

## Author: Sumit Kutty

In [224]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Raise the pandas display limits (up to 200 columns / 300 rows) so wide frames
# and long Series are printed without truncation.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)

# NOTE(review): with no category or module filter this suppresses every warning
# for the rest of the session, including deprecation and convergence warnings.
from warnings import filterwarnings as w
w('ignore')


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest,chi2

#ML
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


### Load Data

In [225]:
# Read the training file and the test file (in that order). `dataset` stays as
# the untouched raw training frame; `data` is the working copy that later cells
# modify, and `target` is the label column taken from that copy.
dataset, test = (pd.read_csv(csv_name) for csv_name in ("SpendData.csv", "TestData.csv"))
data = dataset.copy()
target = data['pov6']

### EDA and Data Cleaning

* Checking for null values

In [226]:
# Number of missing values in each column of the training frame.
data.isna().sum()

Unnamed: 0        0
month             0
var8           4009
var6          16480
a.1               0
              ...  
c.281             0
c.282             0
c.283             0
f.284          4418
t.158         18379
Length: 301, dtype: int64

#### Handling null values and reconciling the features of the train and test sets

In [227]:
def equalize(data, test, max_nulls = 800):
    """Reconcile the train and test frames and list the columns that need imputing.

    Both frames are modified in place and are also returned, so the call can be
    used either for its side effect or for its return value.

    Steps, in order:
      1. keep only the columns that exist in both frames;
      2. drop the row counter 'Unnamed: 0' and the target 'pov6' where present;
      3. drop, from both frames, every column with more than ``max_nulls``
         missing values in the train frame, then every column with more than
         ``max_nulls`` missing values in the test frame;
      4. drop the identifier columns 'respondent.id' and 'year' where present.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame.
    test : pandas.DataFrame
        Test frame.
    max_nulls : int, default 800
        Largest number of missing values a column may have and still be kept.

    Returns
    -------
    train_nulls : list of str
        Columns remaining in ``data`` that contain at least one null.
    test_nulls : list of str
        Columns remaining in ``test`` that contain at least one null.
    data, test : pandas.DataFrame
        The same (mutated) frame objects that were passed in.
    """
    # Keep only the columns the two frames have in common.
    data.drop(columns = [c for c in data.columns if c not in test.columns], inplace = True)
    test.drop(columns = [c for c in test.columns if c not in data.columns], inplace = True)

    # remove other redundant features
    # errors='ignore': a test file without the target column means 'pov6' has
    # already been removed from both frames by the step above.
    data.drop(columns = ['Unnamed: 0', 'pov6'], inplace = True, errors = 'ignore')
    test.drop(columns = ['Unnamed: 0', 'pov6'], inplace = True, errors = 'ignore')

    # TRAIN SET: columns that are too sparse in train are removed from both frames.
    train_counts = data.isnull().sum()
    red_cols = train_counts[train_counts > max_nulls].index.tolist()
    data.drop(columns = red_cols, inplace = True)
    test.drop(columns = red_cols, inplace = True)

    # TEST SET: same rule, judged on the test frame, again applied to both frames.
    test_counts = test.isnull().sum()
    red_cols2 = test_counts[test_counts > max_nulls].index.tolist()
    data.drop(columns = red_cols2, inplace = True)
    test.drop(columns = red_cols2, inplace = True)

    # Identifier columns carry no predictive signal.
    data.drop(columns = ['respondent.id', 'year'], inplace = True, errors = 'ignore')
    test.drop(columns = ['respondent.id', 'year'], inplace = True, errors = 'ignore')

    # Build the null lists last, from the final frames, so they can never name a
    # column that was dropped along the way, and so that a column with exactly
    # ``max_nulls`` missing values is still reported for imputation.
    train_nulls = [c for c in data.columns if data[c].isnull().any()]
    test_nulls = [c for c in test.columns if test[c].isnull().any()]

    return train_nulls, test_nulls, data, test


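##### The function above drops every feature with more than 800 null values from both sets and keeps the features with fewer nulls for imputation. It returns the lists of train and test columns that still contain nulls, together with the reconciled train and test frames.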

In [228]:
# Reconcile the two frames and collect the columns that still contain nulls;
# `data` and `test` are rebound to the frames returned by equalize.
train_nulls, test_nulls, data, test = equalize(data,test)

In [229]:
# Sanity check: print the column count of the train frame, then of the test frame.
print(len(data.columns), len(test.columns), sep = '\n')

239
239


In [None]:
# Draw one distribution plot per train column that has missing values, showing
# each figure before moving on to the next column.
for col in train_nulls:
    sns.distplot(data[col])
    plt.show()

* The features are roughly normally distributed. Either mean or median can be used to replace the null values

#### Imputing means: 
The means of train data features are used to impute features in both train data and test data to avoid data leakage

In [231]:
# Categorical columns are filled with the most frequent value, all other columns
# with the mean. The list is declared here because this cell runs before the
# one-hot-encoding cell that also declares it.
cat_feats = ['month', 'var9']

imputer = SimpleImputer(strategy = 'mean')
imputer2 = SimpleImputer(strategy = 'most_frequent')

# Every column that has nulls in either set is handled once: the imputer is
# fitted on the train column only and the fitted statistic is then used to fill
# the train column and/or the test column, so the test set never influences the
# fill value.
for i in dict.fromkeys(list(train_nulls) + list(test_nulls)):
    if i not in data.columns or i not in test.columns:
        # Stale name for a column that is no longer in the frames; nothing to fill.
        continue
    col_imputer = imputer2 if i in cat_feats else imputer
    col_imputer.fit(data[[i]])
    if i in train_nulls:
        # ravel(): transform returns an (n, 1) array; assign it as a 1-D column.
        data[i] = col_imputer.transform(data[[i]]).ravel()
    if i in test_nulls:
        test[i] = col_imputer.transform(test[[i]]).ravel()

* At this stage, all columns of train and test are the same with no null values.
* There doesn't seem to be any need for further feature engineering apart from one-hot encoding the categorical features.


In [232]:
data.describe()

* The standard deviations of the columns differ widely, so standardization is likely to be essential.

#### One hot encoding the categorical features

In [233]:
#Separate the categorical variables:
cat_feats  =['month', 'var9']

# One hot encode 'month' and 'var9'
data_dum = pd.get_dummies(data[cat_feats].astype('str'))
# Encoding the two sets independently yields different dummy columns whenever a
# category level is missing from one of them. Align the test dummies to the
# train layout: levels absent from test become all-zero columns and levels seen
# only in test are discarded.
test_dum = pd.get_dummies(test[cat_feats].astype('str')).reindex(columns = data_dum.columns, fill_value = 0)


# Numeric part of each frame, to be standardized separately from the dummies.
data_scaled = data.drop(cat_feats, axis = 1)
test_scaled = test.drop(cat_feats, axis = 1)

#### Feature Scaling.

In [234]:
scaler = StandardScaler()

# Start from the cleaned frames rather than from data_scaled/test_scaled, which
# this cell overwrites: re-running the cell then gives the same result instead
# of scaling the dummy columns and appending them a second time.
data_num = data.drop(cat_feats, axis = 1)
test_num = test.drop(cat_feats, axis = 1)

# Scale the variables (statistics come from the train set only). The original
# index is kept so the index-aligned concat below pairs each row with its own
# dummy row.
data_scaled = pd.DataFrame(scaler.fit_transform(data_num), columns = data_num.columns, index = data_num.index)
test_scaled = pd.DataFrame(scaler.transform(test_num), columns = test_num.columns, index = test_num.index)

# Concatenate the scaled numerical variables and the one-hot encoded categorical variables
data_scaled = pd.concat([data_scaled,data_dum], axis = 1)
test_scaled = pd.concat([test_scaled,test_dum], axis = 1)

### Over sampling using SMOTE

In [235]:
from imblearn.over_sampling import SMOTENC

# The categorical inputs are the one-hot dummy columns, which sit after the
# scaled numeric columns in data_scaled. Look their positions up by name:
# position 0 is a scaled numeric feature, not a categorical one.
cat_idx = [data_scaled.columns.get_loc(c) for c in data_dum.columns]

# Fixed seed so the synthetic samples are the same on every run.
smote  = SMOTENC(categorical_features = cat_idx, random_state = 42)

# fit_resample is the supported name of the resampling method (fit_sample was
# its deprecated alias and has been removed from imbalanced-learn).
data_us, target_us = smote.fit_resample(data_scaled, target)

### PCA

In [None]:
from sklearn.decomposition import PCA

n_pc = 30
# `pca` is the fitted transformer; the class keeps its own name so the two are
# never confused. The seed fixes the result when the randomized SVD solver is
# selected.
pca = PCA(n_components = n_pc, random_state = 42)


# Component scores are fitted on the oversampled train set and applied to test.
colnames = ['PC' + str(i) for i in range(1, n_pc+1)]
data_pc = pd.DataFrame(pca.fit_transform(data_us), columns = colnames)
test_pc = pd.DataFrame(pca.transform(test_scaled), columns = colnames)

### Dataset Split


In [None]:
# Hold out 25% of the cleaned training frame. The split is seeded so it is
# reproducible, and stratified so every class keeps its share in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.25, stratify = target, random_state = 42)

### Model Building


In [None]:
# Seed every stochastic component so the comparison is reproducible and all four
# models are scored on the same five folds.
lr = LogisticRegression(solver = 'saga', random_state = 42)
rf = RandomForestClassifier(n_estimators = 500, random_state = 42)
dt = DecisionTreeClassifier(random_state = 42)
knn = KNeighborsClassifier()
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

# NOTE(review): data_pc / target_us were oversampled before this split, so
# synthetic points derived from a validation row can end up in the training
# folds; treat these accuracies as optimistic.
print("ACCURACY:")
for model_name, model in [("Logistic Regression", lr), ("Random Forest", rf), ("Decision Tree", dt), ("KNN", knn)]:
    print(model_name + ": ", np.mean(cross_val_score(model, data_pc, target_us, cv = kfold)))


### Hyper Parameter Tuning

In [None]:
rf_grid = RandomForestClassifier(random_state = 42)
# 'min_impurity_split' is not searched: it was deprecated in favour of
# 'min_impurity_decrease' and has since been removed from scikit-learn.
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' and has also been removed.
params = {'n_estimators':[100,200,300,400,500,600], 'criterion': ['gini','entropy'], 'max_depth':[20,30,40,50,60,80, 100,None],
          'max_features': ['sqrt', 'log2'], 'min_impurity_decrease':[0.0001,0.001,0.01,0.1]}
# Randomized search: 20 sampled parameter settings, 5-fold CV each; seeded so
# the same settings are drawn on every run.
grid_model = RandomizedSearchCV(rf_grid, cv=5, param_distributions = params, n_iter = 20, random_state = 42)
grid_model.fit(data, target)


print("Grid search score is: ", grid_model.best_score_)
print("Grid search estimator: ", grid_model.best_estimator_)

##### The tuned model is tested for prediction

In [None]:
# Re-run the search on the training split only, then score it on the held-out split.
grid_model.fit(xtrain,ytrain)
ypred2 = grid_model.predict(xtest)


# classification_report lists the classes in sorted label order, so the display
# names must be built in that same order (value_counts() orders by frequency).
# Passing `labels` explicitly keeps names and rows paired even if a class is
# missing from the held-out split.
class_values = np.sort(target.unique())
labels = [str(i) for i in class_values]
print("Accuracy: ", accuracy_score(ytest,ypred2))
print()
print("Classification Report: ")
print(classification_report(ytest, ypred2, labels = class_values, target_names=labels))

#### Predicting for the test set

In [None]:
# The model was fitted on splits of `data`; hand it the test columns in that
# same order so each feature lands in the position the model expects.
predictions = grid_model.predict(test[data.columns])

## THE END