In [268]:
#Import library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [310]:
#Read data
train=pd.read_csv('../input/netflix-appetency/train.csv')
test=pd.read_csv('../input/netflix-appetency/test.csv')
sample=pd.read_csv('../input/netflix-appetency/sample_submission.csv')

In [311]:
train.head()

Unnamed: 0,id,target,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_497,feature_498,feature_499,feature_500,feature_501,feature_502,feature_503,feature_504,feature_505,feature_506
0,0,0,C0,C0,C1,C5,C11,37.56,54.756667,54.756667,...,0,0,0,0,0,0,0,0,0,0
1,1,0,C0,C0,C3,C5,C1,,,,...,0,0,0,0,0,0,0,0,0,0
2,2,0,C0,C0,C3,C5,C2,,,,...,0,0,0,0,0,0,0,0,0,0
3,3,0,C0,C0,C1,C5,C1,,,,...,0,0,0,0,0,0,0,0,0,0
4,4,1,C0,C0,C3,C3,C11,37.48,37.48,37.161333,...,0,0,0,0,0,0,0,0,0,0


In [312]:
#create train, target, and test
target=train["target"]
train=train.drop(["id","target"],axis=1)
test=test.drop(["id"],axis=1)

In [295]:
train.shape, test.shape

((70000, 507), (30000, 507))

In [313]:
#Get columns with more than 50% missing values
Missing_features=train.loc[:,(train.isnull().sum()/train.shape[0])>0.5]
#Drop them from train and test 
train=train.drop(Missing_features.columns,axis=1)
test=test.drop(Missing_features.columns,axis=1)

In [314]:
#Numeric features name
Numeric_Features=train.select_dtypes(exclude=['object']).columns[2:] #skip id,target

In [315]:
#Fillna numeric features  with mean
train[Numeric_Features]=train[Numeric_Features].fillna(train[Numeric_Features].mean())
test[Numeric_Features]=test[Numeric_Features].fillna(test[Numeric_Features].mean())

In [316]:
#check NA in train
print(train.loc[:,(train.isnull().sum())>0].shape)
#check NA in test
print(test.loc[:,(test.isnull().sum())>0].shape)

(70000, 7)
(30000, 7)


In [317]:
datetime_Features=train.loc[:,(train.isnull().sum()/train.shape[0])>0]
#Drop from train/test
train.drop(datetime_Features.columns,axis=1,inplace=True)
test.drop(datetime_Features.columns,axis=1,inplace=True)

In [318]:
#Drop column contains one unique value
one_unique=train.loc[:,train.nunique()==1]
train.drop(one_unique.columns,axis=1,inplace=True)
test.drop(one_unique.columns,axis=1,inplace=True)

In [319]:
#Categorical features name
Categorical_Features=train.select_dtypes(include=['object']).columns
for index in Categorical_Features:
    label_encoder = LabelEncoder()
    train[index]=label_encoder.fit_transform(train[index])
    test[index]=label_encoder.fit_transform(test[index])

In [320]:
train.shape, test.shape

((70000, 419), (30000, 419))

In [309]:
train[1,:]

array([-5.52462160e-02, -6.08222741e-02,  4.58618243e-01,  5.26791563e-01,
       -8.28910750e-01, -1.01484852e-01, -1.31627316e-01,  1.39212657e+00,
       -2.10092996e-01, -1.25037732e+00,  1.53988247e-01, -6.71315490e-01,
       -1.61027808e-01, -1.61161988e-01, -1.61390626e-01,  5.81420252e-01,
       -4.77669021e-01, -4.93719745e-01, -7.90925497e-01, -5.42897611e-01,
       -7.98390193e-01, -6.94589157e-01, -5.83992765e-01, -4.19236454e-01,
       -5.70091863e-01, -1.30717492e-01, -5.41644600e-03, -6.98425951e-03,
       -4.84663831e-03, -5.73791960e-03, -5.20468640e-03, -1.63795532e-02,
       -1.32179568e-01, -2.14789508e-01,  1.45418048e+00, -1.29982447e-01,
       -3.59670324e-01, -2.56290747e-01, -9.52217909e-02,  1.29522941e+00,
       -4.94169936e-02, -4.16120723e-02, -4.13615433e-01, -2.76513767e-02,
        5.27625978e-01,  1.26633500e-01, -4.47528535e-02, -4.57734695e-02,
        8.18948233e-01, -2.24300095e-02,  9.95349694e-02, -4.87673882e-02,
       -2.31664770e-02, -

In [321]:
#rescale train and test
scaler = StandardScaler() 
scaler.fit(train) 
train = scaler.transform(train)
test = scaler.transform(test)

In [324]:
#train a logistic classifier 
Logistclf = LogisticRegression(penalty="l2",max_iter=1000).fit(train, target)
#predict test data with logistic classifier
Logistpredict = Logistclf.predict_proba(test)

#Read the sample_submission
sample['target']=Logistpredict[:,1]
sample.to_csv('./Logist.csv',index=False)
sample.head(10)

Unnamed: 0,id,target
0,5,0.355489
1,7,0.285036
2,9,0.23138
3,11,0.113602
4,14,0.173737
5,15,0.185951
6,19,0.161749
7,21,0.081033
8,25,0.254415
9,29,0.135476


In [325]:
Logistclf.coef_.shape

(1, 419)

In [326]:
#neural network from sklearn for embeddings
mlp = MLPClassifier(solver='adam',activation= 'logistic', random_state=1,max_iter=2000, warm_start=True)
parameter_space = {
    'hidden_layer_sizes': [(12,),(10,), (8,), (6,),(4,),(2,)],
    'alpha': [0.3,0.4,0.5,0.6,0.7,0.8],
}

In [None]:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(train, y)

In [252]:
print (clf.cv_results_['params'])
print(clf.cv_results_['mean_test_score'])

[{'alpha': 0.5, 'hidden_layer_sizes': (12,)}, {'alpha': 0.5, 'hidden_layer_sizes': (10,)}, {'alpha': 0.5, 'hidden_layer_sizes': (8,)}, {'alpha': 0.5, 'hidden_layer_sizes': (6,)}, {'alpha': 0.5, 'hidden_layer_sizes': (4,)}, {'alpha': 0.5, 'hidden_layer_sizes': (2,)}, {'alpha': 1, 'hidden_layer_sizes': (12,)}, {'alpha': 1, 'hidden_layer_sizes': (10,)}, {'alpha': 1, 'hidden_layer_sizes': (8,)}, {'alpha': 1, 'hidden_layer_sizes': (6,)}, {'alpha': 1, 'hidden_layer_sizes': (4,)}, {'alpha': 1, 'hidden_layer_sizes': (2,)}, {'alpha': 1.5, 'hidden_layer_sizes': (12,)}, {'alpha': 1.5, 'hidden_layer_sizes': (10,)}, {'alpha': 1.5, 'hidden_layer_sizes': (8,)}, {'alpha': 1.5, 'hidden_layer_sizes': (6,)}, {'alpha': 1.5, 'hidden_layer_sizes': (4,)}, {'alpha': 1.5, 'hidden_layer_sizes': (2,)}, {'alpha': 2, 'hidden_layer_sizes': (12,)}, {'alpha': 2, 'hidden_layer_sizes': (10,)}, {'alpha': 2, 'hidden_layer_sizes': (8,)}, {'alpha': 2, 'hidden_layer_sizes': (6,)}, {'alpha': 2, 'hidden_layer_sizes': (4,)}, {

In [253]:
clf.cv_results_['params'][12:17]

[{'alpha': 1.5, 'hidden_layer_sizes': (12,)},
 {'alpha': 1.5, 'hidden_layer_sizes': (10,)},
 {'alpha': 1.5, 'hidden_layer_sizes': (8,)},
 {'alpha': 1.5, 'hidden_layer_sizes': (6,)},
 {'alpha': 1.5, 'hidden_layer_sizes': (4,)}]

In [254]:
clf.cv_results_['params'][0:6]

[{'alpha': 0.5, 'hidden_layer_sizes': (12,)},
 {'alpha': 0.5, 'hidden_layer_sizes': (10,)},
 {'alpha': 0.5, 'hidden_layer_sizes': (8,)},
 {'alpha': 0.5, 'hidden_layer_sizes': (6,)},
 {'alpha': 0.5, 'hidden_layer_sizes': (4,)},
 {'alpha': 0.5, 'hidden_layer_sizes': (2,)}]

In [235]:
sklearn_nn_predict=clf.predict_proba(test)
#Read the sample_submission
sample['target']=sklearn_nn_predict[:,1]
sample.to_csv('./sklearn_nn.csv',index=False)
sample.head(10)

Unnamed: 0,id,target
0,5,0.405091
1,7,0.302033
2,9,0.212932
3,11,0.116148
4,14,0.178395
5,15,0.267613
6,19,0.164515
7,21,0.109501
8,25,0.252417
9,29,0.139366
