# Tabular Playground Series - Oct 2021
## Data Description

This dataset is from a Kaggle competition. We will be predicting a binary target based on a number of feature columns given in the data. The columns are a mix of scaled continuous features and binary features.

The data is synthetically generated by a GAN that was trained on real-world molecular response data.

In [1]:
import kaggle 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns
import sklearn
import os



In [2]:
# Kaggle CLI command that downloads the competition archive.
# Kept commented out so it does not run with the rest of the notebook.
#!kaggle competitions download -c tabular-playground-series-oct-2021

In [3]:
# List the entries of the current working directory.
os.listdir()

['.DS_Store',
 'Binary_classification.ipynb',
 'tabular-playground-series-oct-2021.zip',
 '.gitignore',
 '.ipynb_checkpoints',
 '.git',
 'data']

In [4]:
import zipfile

# Unpack every member of the competition archive into the `data` folder.
with zipfile.ZipFile('tabular-playground-series-oct-2021.zip', mode='r') as archive:
    archive.extractall(path='data')

In [5]:
# List the entries of the `data` folder.
os.listdir('./data/')

['test.csv', 'train.csv', 'sample_submission.csv']

In [6]:
# Train and test frames are both indexed by their `id` column.
train_df, test_inputs = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)
# The sample submission is read with the default integer index.
sample_df = pd.read_csv('./data/sample_submission.csv')

In [7]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,1,0,0,0,0,0,0,0,1
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,1,0,0,0,0,0,0,0,1
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0,0,1,1,0,0,0,0,1
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0,0,0,1,0,0,0,0,1
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,1,1,0,1,0,0,1,0,1


In [8]:
# (rows, columns) of the training frame and of the test frame
(train_df.shape, test_inputs.shape)

((1000000, 286), (500000, 285))

In [9]:
# Number of rows per class in the label column
train_df['target'].value_counts()

1    500485
0    499515
Name: target, dtype: int64

**The two classes are almost perfectly balanced (about 50% each).** A baseline model that always predicts the same class would therefore score about 50% accuracy.

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
train_df.describe()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.214334,0.460218,0.129253,0.277598,0.58071,0.416619,0.386532,0.654858,0.462256,0.258031,...,0.250096,0.137164,0.144793,0.130667,0.13921,0.199331,0.156065,0.183741,0.15468,0.500485
std,0.05332,0.101316,0.120805,0.063163,0.115338,0.058231,0.133457,0.065158,0.129439,0.119081,...,0.433068,0.344021,0.351892,0.337036,0.346166,0.399498,0.362917,0.387273,0.3616,0.5
min,0.04179,0.022016,0.000381,0.0,0.0,0.000959,0.000592,0.017994,0.00099,0.051183,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.181676,0.389215,0.017692,0.235342,0.497938,0.37439,0.317815,0.615372,0.363753,0.164559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.204498,0.453893,0.095496,0.264669,0.565059,0.414009,0.42026,0.648562,0.475701,0.227714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.229684,0.526023,0.177717,0.305837,0.657024,0.45836,0.47714,0.692666,0.561372,0.300988,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,0.959019,0.994818,0.979797,1.0,0.907889,0.972601,0.986195,0.986118,0.980994,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Index range, column count, dtype breakdown and memory footprint of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 286 entries, f0 to target
dtypes: float64(240), int64(46)
memory usage: 2.1 GB


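**All columns are numeric, the continuous features already lie in the [0, 1] range, and the displayed counts show no missing values, so we don't need to apply scaling, encoding, or imputation.**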

# Train/Validation Dataset

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [13]:
# Feature matrix: every column except the label.
inputs = train_df.drop(columns='target')
# Label vector.
targets = train_df['target']

In [14]:
# Hold out 25% of the labelled rows for validation.
# The seed is fixed (same value the seeded models below use) so the split, and
# every validation accuracy computed from it, is reproducible across re-runs.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs,
    targets,
    test_size=0.25,
    random_state=1,
)
test_inputs

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000000,0.178216,0.435617,0.010230,0.202074,0.390170,0.324221,0.221722,0.738894,0.582588,0.343770,...,1,0,0,0,0,0,1,1,1,0
1000001,0.181250,0.476455,0.022413,0.283146,0.598020,0.349508,0.283467,0.721575,0.268990,0.208373,...,0,0,0,0,0,0,0,0,0,0
1000002,0.159721,0.451202,0.259649,0.365274,0.594634,0.413502,0.249318,0.642339,0.411104,0.246891,...,0,0,0,0,0,0,1,0,0,0
1000003,0.182424,0.520976,0.095344,0.327742,0.741830,0.358711,0.270077,0.601662,0.297742,0.252829,...,0,0,0,0,0,1,1,0,0,0
1000004,0.229329,0.336513,0.023511,0.300913,0.668738,0.481586,0.545660,0.667849,0.546045,0.202731,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499995,0.185473,0.525338,0.014945,0.204029,0.498957,0.431933,0.470553,0.731268,0.452134,0.211206,...,1,0,0,0,0,0,1,0,0,0
1499996,0.183194,0.553266,0.008055,0.280651,0.636591,0.478092,0.450386,0.582647,0.517270,0.225116,...,1,0,0,0,0,0,0,1,0,0
1499997,0.184650,0.533643,0.011218,0.201262,0.720698,0.472888,0.427469,0.589259,0.243879,0.413022,...,1,1,0,0,0,0,0,0,0,0
1499998,0.227731,0.513247,0.178603,0.313778,0.665656,0.401365,0.183369,0.708233,0.575135,0.152771,...,0,0,0,0,0,0,1,0,0,0


# Model Selection for classification
### There are many models out there (60+), but we will be using the most common and efficient models preferred for binary classification.
We'll be using the following models first and then fine-tuning the best one later  
1. Logistic Regression
2. SVM (SVC, LinearSVC)
3. KNN
4. GaussianNB
5. Perceptron
6. SGDClassifier
7. DecisionTreeClassifier
8. RandomForestClassifier

In [15]:
#let's do the imports first
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Logistic Regression

# Fit on the training split ('saga' solver, 4 jobs, fixed seed); `fit` returns the estimator itself.
logreg = LogisticRegression(solver='saga', n_jobs=4, random_state=1).fit(
    train_inputs, train_targets
)
# Predictions for the unlabelled test frame.
Y_pred = logreg.predict(test_inputs)
# Validation accuracy as a percentage, rounded to two decimals.
acc_log = round(100 * logreg.score(val_inputs, val_targets), 2)
print(acc_log)

76.07


In [17]:
# Gaussian Naive Bayes

# Fit on the training split; `fit` returns the estimator itself.
gaussian = GaussianNB().fit(train_inputs, train_targets)
# Predictions for the unlabelled test frame.
Y_pred = gaussian.predict(test_inputs)
# Validation accuracy as a percentage, rounded to two decimals.
acc_gaussian = round(100 * gaussian.score(val_inputs, val_targets), 2)
print(acc_gaussian)


69.88


In [18]:
# Perceptron

# Fit on the training split with a fixed seed; `fit` returns the estimator itself.
perceptron = Perceptron(random_state=1).fit(train_inputs, train_targets)
# Predictions for the unlabelled test frame.
Y_pred = perceptron.predict(test_inputs)
# Validation accuracy as a percentage, rounded to two decimals.
acc_perceptron = round(100 * perceptron.score(val_inputs, val_targets), 2)
print(acc_perceptron)

62.11


In [19]:
# Stochastic Gradient Descent

# SGD shuffles the training data between epochs, so the seed is fixed
# (same value as the other seeded models) to make the score reproducible.
sgd = SGDClassifier(random_state=1)
sgd.fit(train_inputs, train_targets)
# Predictions for the unlabelled test frame.
Y_pred = sgd.predict(test_inputs)
# Validation accuracy as a percentage, rounded to two decimals.
acc_sgd = round(sgd.score(val_inputs, val_targets) * 100, 2)
print(acc_sgd)


75.55


In [None]:
# Decision Tree

# Fixed seed (same value as the other seeded models) so the fitted tree and
# its validation accuracy are reproducible across re-runs.
decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(train_inputs, train_targets)
# Predictions for the unlabelled test frame.
Y_pred = decision_tree.predict(test_inputs)
# Validation accuracy as a percentage, rounded to two decimals.
acc_decision_tree = round(decision_tree.score(val_inputs, val_targets) * 100, 2)
print(acc_decision_tree)


In [None]:

# Random Forest

# Fixed seed (same value as the other seeded models) so the forest and its
# scores are reproducible across re-runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(train_inputs, train_targets)
# Predictions for the unlabelled test frame.
Y_pred = random_forest.predict(test_inputs)
# Accuracy on the training split, kept so it can be compared with the
# validation accuracy to spot overfitting.
acc_random_forest_train = round(random_forest.score(train_inputs, train_targets) * 100, 2)
print('train accuracy:', acc_random_forest_train)
# Validation accuracy as a percentage, rounded to two decimals.
acc_random_forest = round(random_forest.score(val_inputs, val_targets) * 100, 2)
print(acc_random_forest)