In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier 

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_columns', 1000)

In [None]:
# Read the training and test CSV files into DataFrames.
train_csv, test_csv = 'data/train_data.csv', 'data/test_data.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Report (rows, columns) for both frames; DataFrame.shape unpacks into the two placeholders.
print('The train data has {} rows and {} columns'.format(*train.shape))
print('The test data has {} rows and {} columns'.format(*test.shape))

In [None]:
## check target class
# Relative frequency of each value of the 'target' column in the training data.
target_column = train['target']
target_column.value_counts(normalize=True)

First, check for missing values. The check below shows there aren't any, so we can jump straight to building models to get a baseline score.

In [None]:
## check missing values
# Number of null entries per column (the original author notes there are none).
train.isna().sum()

### Model 0 - Majority Class

In [None]:
## baseline submission: predict 0 for every row of the sample submission
sub = pd.read_csv('data/sample_submission.csv').assign(target=0)
sub.to_csv('submissions/sub0.csv', index=False)  ## 0.58 (value recorded by the original author)

### Model 1 - XGB

In [None]:
#from sklearn.model_selection import train_test_split
#import xgboost as xgb

In [None]:
# Every column except the row identifier and the label is used as a model input.
non_feature_columns = ('connection_id', 'target')
feature_names = [column for column in train.columns if column not in non_feature_columns]
target = train['target']

In [None]:
#X_train, X_valid, y_train, y_valid = train_test_split(train, target, train_size = 0.7, stratify = target, random_state = 2017)

In [None]:
## function for multi-accuracy
#from sklearn.metrics import accuracy_score
#def multAcc(pred, dtrain):
#    label = dtrain.get_label()
#    acc = accuracy_score(label, pred)
#    return 'maccuracy', acc

In [None]:
# default parameters
#params = {'objective':'multi:softmax',
 #         'num_class':3,
          # 'eval_metric':'merror'
  #       }

In [None]:
#dtrain = xgb.DMatrix(data=X_train[feature_names], label=y_train)
#dvalid = xgb.DMatrix(data=X_valid[feature_names], label=y_valid)
#dtest = xgb.DMatrix(data=test[feature_names])
#watchlist = [(dtrain, 'train'),(dvalid, 'eval')]

In [None]:
#clf1 = xgb.train(params, dtrain, 1000, watchlist, maximize=True, verbose_eval=20, early_stopping_rounds=40, feval=multAcc)

In [None]:
#pred = clf1.predict(dtest)

In [None]:
## make submission
#sub = pd.read_csv('data/sample_submission.csv')
#sub['target'] = pred
#sub['target'] = sub['target'].astype(int)
#sub.to_csv('submissions/sub1.csv', index=False)

### Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
## set up model
# max_features='sqrt' makes the tree consider a random subset of features at each
# split, so a fixed random_state is needed for the fit (and the submission built
# from it) to be reproducible across kernel restarts. 32 matches the seed used by
# the random-forest models later in this notebook.
clf2 = DecisionTreeClassifier(max_depth=8, min_samples_split=7, max_features='sqrt', random_state=32)

In [None]:
## train model
# Fit the decision tree on the selected feature columns of the training frame.
train_features = train[feature_names]
clf2.fit(train_features, target)

In [None]:
## make prediction
# Predict a class for every row of the test frame, using the same feature columns as in training.
test_features = test[feature_names]
pred2 = clf2.predict(test_features)

In [None]:
## make submission
# Start from the sample submission, overwrite its target with the tree's
# predictions, force an integer dtype and write the result.
sub = pd.read_csv('data/sample_submission.csv')
sub['target'] = pred2
sub = sub.astype({'target': int})
sub.to_csv('submissions/sub2.csv', index=False)

### KNN

In [None]:
print("Starting KNN")
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

knn.fit(train[feature_names], target)

# Test-set predictions are used only for the submission file. Scoring them
# against the first rows of the training labels (as this cell used to do)
# compares unrelated rows and produces a meaningless number.
predictions = knn.predict(test[feature_names]).astype(int)

# "Train accuracy" is computed on the rows the model was fitted on, so labels
# and predictions line up row-for-row.
# NOTE: for KNN this is optimistic, because every training row is among its
# own nearest neighbours; use a held-out split for an honest estimate.
actual_targets = target.astype(int).tolist()
train_predictions = knn.predict(train[feature_names]).astype(int)

print(train_predictions[:5])
print(actual_targets[:5])

# Passing labels= pins the matrix to 3x3 even if a class is never predicted;
# otherwise the DataFrame constructor fails on a shape mismatch.
class_labels = [0, 1, 2]
cm = pd.DataFrame(confusion_matrix(actual_targets, train_predictions, labels=class_labels),
                  columns=class_labels, index=class_labels)
print(cm)

print("Train Accuracy :: {}".format(accuracy_score(actual_targets, train_predictions)))

# Reload the template so this cell does not depend on `sub` left over from an
# earlier cell.
sub = pd.read_csv('data/sample_submission.csv')
sub['target'] = predictions
sub['target'] = sub['target'].astype(int)
sub.to_csv('submissions/knn.csv', index=False)

### MLP

In [None]:
print("Starting MLP")

# NOTE(review): hidden_layer_sizes=(1, 3) means a first hidden layer with a
# single unit followed by one with three units; kept as in the original, but
# confirm this bottleneck is intentional.
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(1,3), random_state=1)

mlp.fit(train[feature_names], target)

# Test-set predictions are used only for the submission file. Scoring them
# against the first rows of the training labels (as this cell used to do)
# compares unrelated rows and produces a meaningless number.
predictions = mlp.predict(test[feature_names]).astype(int)

# "Train accuracy" is computed on the rows the model was fitted on, so labels
# and predictions line up row-for-row.
actual_targets = target.astype(int).tolist()
train_predictions = mlp.predict(train[feature_names]).astype(int)

print(train_predictions[:5])
print(actual_targets[:5])

# Passing labels= pins the matrix to 3x3 even if a class is never predicted;
# otherwise the DataFrame constructor fails on a shape mismatch.
class_labels = [0, 1, 2]
cm = pd.DataFrame(confusion_matrix(actual_targets, train_predictions, labels=class_labels),
                  columns=class_labels, index=class_labels)
print(cm)

print("Train Accuracy :: {}".format(accuracy_score(actual_targets, train_predictions)))

# Reload the template so this cell does not depend on `sub` left over from an
# earlier cell.
sub = pd.read_csv('data/sample_submission.csv')
sub['target'] = predictions
sub['target'] = sub['target'].astype(int)
sub.to_csv('submissions/mlp.csv', index=False)

### RFR

In [None]:
print("Starting RFR")
rfr = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=32,n_jobs = -1, min_samples_leaf=4)
rfr.fit(train[feature_names], target)

# The regressor returns floats. Round to the nearest class label before casting:
# a bare .astype(int) truncates toward zero, so e.g. 1.9 would become class 1.
predictions = np.rint(rfr.predict(test[feature_names])).astype(int)

# Score on the rows the model was fitted on. Comparing test-set predictions
# with the first rows of the training labels (as this cell used to do)
# compares unrelated rows.
actual_targets = target.astype(int).tolist()
train_predictions = np.rint(rfr.predict(train[feature_names])).astype(int)

print(train_predictions[:5])
print(actual_targets[:5])

# Passing labels= pins the matrix to 3x3 even if a class is never predicted;
# otherwise the DataFrame constructor fails on a shape mismatch.
class_labels = [0, 1, 2]
cm = pd.DataFrame(confusion_matrix(actual_targets, train_predictions, labels=class_labels),
                  columns=class_labels, index=class_labels)
print(cm)

# NOTE: for a regressor the OOB score is a regression score (R^2 by default in
# scikit-learn), not a classification accuracy.
print("OOB Score :: {}".format(rfr.oob_score_))
print("Train Accuracy :: {}".format(accuracy_score(actual_targets, train_predictions)))

# Reload the template so this cell does not depend on `sub` left over from an
# earlier cell.
sub = pd.read_csv('data/sample_submission.csv')
sub['target'] = predictions
sub['target'] = sub['target'].astype(int)
sub.to_csv('submissions/random_forest_regressor.csv', index=False)

### Random Forest

In [None]:
# Candidate values for the forest's min_samples_leaf; each one is passed to run_forest below.
sample_leaf_options = [5]

def run_forest(leaf_size):
    """Fit a random-forest classifier, report training metrics and write a submission.

    Reads the notebook-level ``train``, ``test``, ``feature_names`` and
    ``target`` objects.

    Parameters
    ----------
    leaf_size : int
        Value passed to ``min_samples_leaf`` of the classifier.

    Returns
    -------
    None
        Results are printed; test-set predictions are written to
        'submissions/random_forest.csv'.
    """
    clf = RandomForestClassifier(n_estimators = 100,
                                 oob_score = True,
                                 n_jobs = -1,
                                 random_state = 32,
                                 # "auto" meant sqrt(n_features) for classifiers and has been
                                 # removed from recent scikit-learn releases; "sqrt" is the
                                 # equivalent spelling accepted by old and new versions.
                                 max_features = "sqrt",
                                 min_samples_leaf = leaf_size)

    trained_model = clf.fit(train[feature_names], target)

    print("Trained model :: ", trained_model)

    # Test-set predictions are used only for the submission file.
    predictions = trained_model.predict(test[feature_names]).astype(int)

    # Metrics are computed on the rows the model was fitted on, so labels and
    # predictions line up row-for-row (previously test predictions were
    # compared with the first rows of the training labels).
    actual_targets = target.astype(int).tolist()
    train_predictions = trained_model.predict(train[feature_names]).astype(int)

    # Show the first (up to) 25 label/prediction pairs; slicing avoids an
    # IndexError on small frames and avoids rebuilding the label list per row.
    for actual, predicted in zip(actual_targets[:25], train_predictions[:25]):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))

    print("OOB Score :: {}".format(clf.oob_score_))
    print("Train Accuracy :: {}".format(accuracy_score(actual_targets, train_predictions)))

    # Passing labels= pins the matrix to 3x3 even if a class is never predicted.
    class_labels = [0, 1, 2]
    cm = pd.DataFrame(confusion_matrix(actual_targets, train_predictions, labels=class_labels),
                      columns=class_labels, index=class_labels)
    print(cm)

    # Use a freshly loaded template instead of mutating a global `sub` left
    # over from an earlier cell.
    submission = pd.read_csv('data/sample_submission.csv')
    submission['target'] = predictions
    submission['target'] = submission['target'].astype(int)
    submission.to_csv('submissions/random_forest.csv', index=False)
    return

# Fit and evaluate one forest per candidate leaf size. Every run writes to the
# same 'submissions/random_forest.csv', so only the last candidate's file is kept.
for leaf_size in sample_leaf_options:
    run_forest(leaf_size)