In [1]:
#Import library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
#Read data
train=pd.read_csv('../input/netflix-appetency/train.csv')
test=pd.read_csv('../input/netflix-appetency/test.csv')
sample=pd.read_csv('../input/netflix-appetency/sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,id,target,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_497,feature_498,feature_499,feature_500,feature_501,feature_502,feature_503,feature_504,feature_505,feature_506
0,0,0,C0,C0,C1,C5,C11,37.56,54.756667,54.756667,...,0,0,0,0,0,0,0,0,0,0
1,1,0,C0,C0,C3,C5,C1,,,,...,0,0,0,0,0,0,0,0,0,0
2,2,0,C0,C0,C3,C5,C2,,,,...,0,0,0,0,0,0,0,0,0,0
3,3,0,C0,C0,C1,C5,C1,,,,...,0,0,0,0,0,0,0,0,0,0
4,4,1,C0,C0,C3,C3,C11,37.48,37.48,37.161333,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#create train, target, and test
target=train["target"]
train=train.drop(["id","target"],axis=1)
test=test.drop(["id"],axis=1)

In [5]:
train.shape, test.shape

((70000, 507), (30000, 507))

In [6]:
#Get columns with more than 50% missing values
Missing_features=train.loc[:,(train.isnull().sum()/train.shape[0])>0.5]
#Drop them from train and test 
train=train.drop(Missing_features.columns,axis=1)
test=test.drop(Missing_features.columns,axis=1)

In [7]:
#Numeric features name
Numeric_Features=train.select_dtypes(exclude=['object']).columns[2:] #skip id,target

In [8]:
#Fillna numeric features  with mean
train[Numeric_Features]=train[Numeric_Features].fillna(train[Numeric_Features].mean())
test[Numeric_Features]=test[Numeric_Features].fillna(test[Numeric_Features].mean())

In [9]:
#check NA in train
print(train.loc[:,(train.isnull().sum())>0].shape)
#check NA in test
print(test.loc[:,(test.isnull().sum())>0].shape)

(70000, 7)
(30000, 7)


In [10]:
datetime_Features=train.loc[:,(train.isnull().sum()/train.shape[0])>0]
#Drop from train/test
train.drop(datetime_Features.columns,axis=1,inplace=True)
test.drop(datetime_Features.columns,axis=1,inplace=True)

In [11]:
#Drop column contains one unique value
one_unique=train.loc[:,train.nunique()==1]
train.drop(one_unique.columns,axis=1,inplace=True)
test.drop(one_unique.columns,axis=1,inplace=True)

In [12]:
#Categorical features name
Categorical_Features=train.select_dtypes(include=['object']).columns
for index in Categorical_Features:
    label_encoder = LabelEncoder()
    train[index]=label_encoder.fit_transform(train[index])
    test[index]=label_encoder.fit_transform(test[index])

In [13]:
train.shape, test.shape

((70000, 419), (30000, 419))

In [15]:
#rescale train and test
scaler = StandardScaler() 
scaler.fit(train) 
train = scaler.transform(train)
test = scaler.transform(test)

In [16]:
#train a logistic classifier 
Logistclf = LogisticRegression(penalty="l2",max_iter=1000).fit(train, target)
#predict test data with logistic classifier
Logistpredict = Logistclf.predict_proba(test)

#Read the sample_submission
sample['target']=Logistpredict[:,1]
sample.to_csv('./Logist.csv',index=False)
sample.head(10)

Unnamed: 0,id,target
0,5,0.355489
1,7,0.285036
2,9,0.23138
3,11,0.113602
4,14,0.173737
5,15,0.185951
6,19,0.161749
7,21,0.081033
8,25,0.254415
9,29,0.135476


In [17]:
Logistclf.coef_.shape

(1, 419)

In [18]:
#neural network from sklearn for embeddings
mlp = MLPClassifier(solver='adam',activation= 'logistic', random_state=1,max_iter=2000, warm_start=True)
parameter_space = {
    'hidden_layer_sizes': [(12,),(10,), (8,), (6,),(4,),(2,)],
    'alpha': [0.3,0.4,0.5,0.6,0.7,0.8],
}

In [20]:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(train, target)

GridSearchCV(cv=3,
             estimator=MLPClassifier(activation='logistic', max_iter=2000,
                                     random_state=1, warm_start=True),
             n_jobs=-1,
             param_grid={'alpha': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
                         'hidden_layer_sizes': [(12,), (10,), (8,), (6,), (4,),
                                                (2,)]})

In [21]:
print (clf.cv_results_['params'])
print(clf.cv_results_['mean_test_score'])

[{'alpha': 0.3, 'hidden_layer_sizes': (12,)}, {'alpha': 0.3, 'hidden_layer_sizes': (10,)}, {'alpha': 0.3, 'hidden_layer_sizes': (8,)}, {'alpha': 0.3, 'hidden_layer_sizes': (6,)}, {'alpha': 0.3, 'hidden_layer_sizes': (4,)}, {'alpha': 0.3, 'hidden_layer_sizes': (2,)}, {'alpha': 0.4, 'hidden_layer_sizes': (12,)}, {'alpha': 0.4, 'hidden_layer_sizes': (10,)}, {'alpha': 0.4, 'hidden_layer_sizes': (8,)}, {'alpha': 0.4, 'hidden_layer_sizes': (6,)}, {'alpha': 0.4, 'hidden_layer_sizes': (4,)}, {'alpha': 0.4, 'hidden_layer_sizes': (2,)}, {'alpha': 0.5, 'hidden_layer_sizes': (12,)}, {'alpha': 0.5, 'hidden_layer_sizes': (10,)}, {'alpha': 0.5, 'hidden_layer_sizes': (8,)}, {'alpha': 0.5, 'hidden_layer_sizes': (6,)}, {'alpha': 0.5, 'hidden_layer_sizes': (4,)}, {'alpha': 0.5, 'hidden_layer_sizes': (2,)}, {'alpha': 0.6, 'hidden_layer_sizes': (12,)}, {'alpha': 0.6, 'hidden_layer_sizes': (10,)}, {'alpha': 0.6, 'hidden_layer_sizes': (8,)}, {'alpha': 0.6, 'hidden_layer_sizes': (6,)}, {'alpha': 0.6, 'hidden_

In [22]:
sklearn_nn_predict=clf.predict_proba(test)
#Read the sample_submission
sample['target']=sklearn_nn_predict[:,1]
sample.to_csv('./sklearn_nn.csv',index=False)
sample.head(10)

Unnamed: 0,id,target
0,5,0.409265
1,7,0.28918
2,9,0.181864
3,11,0.113919
4,14,0.156719
5,15,0.28455
6,19,0.187014
7,21,0.110776
8,25,0.229921
9,29,0.150691
