In [1]:
# To support python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# To plot pretty figures
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To make output stable across all runs
np.random.seed(42)
    
# Ignore useless warning
import warnings
warnings.filterwarnings(action='ignore', message="^internal gelsd")

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
#train.head()

In [4]:
#train.shape

In [5]:
#train.info()

In [6]:
#train.describe()

In [7]:
#train.hist(bins=50, figsize=(15, 8))

The data is more scaled to the right than to the left

In [8]:
#train.isna().sum()


In [9]:
#Checking for correlation

#corr_matrix = train.corr()
#corr_matrix['CHURN'].sort_values(ascending=False)

# Preparing Data For Machine Learning Algorithm

In [10]:
from sklearn import preprocessing

# Concatinating train and test for easy featuring
ntrain = train.shape[0]
ntest = test.shape[0]

# Get data target variable
lab_enc = preprocessing.LabelEncoder()
y = lab_enc.fit_transform(train['CHURN'])

all_data = pd.concat((train, test)).reset_index(drop=True)

# Drop Target variable
all_data.drop(['CHURN', 'user_id'], axis=1, inplace=True)

print("Total data size is: {}".format(all_data.shape))

Total data size is: (500000, 17)


In [11]:
# Dealing with missing values with scikit-learn
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

# Since median only compute numerical attributes,create
# a copy of the data without text attibute(ocean_proximity)
all_data_cat = [column for column in all_data.columns if all_data[column].dtypes == 'O']
for i in range(len(all_data_cat)):
    all_data_num = all_data.drop(all_data_cat, axis=1)
    
# fit the imputer instance to the training data set
imputer.fit(all_data_num)

SimpleImputer(strategy='median')

In [12]:
imputer.statistics_

array([3.00e+03, 7.00e+00, 3.00e+03, 1.00e+03, 9.00e+00, 2.67e+02,
       2.70e+01, 2.90e+01, 6.00e+00, 1.00e+00, 2.00e+00, 2.40e+01,
       5.00e+00])

In [13]:
all_data_num.median().values

array([3.00e+03, 7.00e+00, 3.00e+03, 1.00e+03, 9.00e+00, 2.67e+02,
       2.70e+01, 2.90e+01, 6.00e+00, 1.00e+00, 2.00e+00, 2.40e+01,
       5.00e+00])

In [14]:
# Use the trained imputer to transform the training set by replacing missing values with the learned median
X = imputer.transform(all_data_num)

In [15]:
# Convert back to Pandas Dataframe
all_data_tr = pd.DataFrame(X, columns=all_data_num.columns)

In [16]:
# Handling Text and Categorical Attributes
missing_cat = [var for var in all_data.columns if all_data[var].dtypes=='O' and all_data[var].isnull().mean()>0]
missing_cat

['REGION', 'TOP_PACK']

In [17]:
for i in missing_cat:
    all_data[i].fillna(all_data[i].mode()[0], inplace=True)

In [18]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
all_data_cat_encoded = encoder.fit_transform(all_data_cat)
all_data_cat_encoded

array([1, 2, 0, 3], dtype=int64)

In [19]:
print(encoder.classes_)

['MRG' 'REGION' 'TENURE' 'TOP_PACK']


In [20]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, StandardScaler

cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(all_data_cat_encoded.reshape(-1, 1))
housing_cat_1hot

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [22]:
from sklearn.pipeline import Pipeline

num_attribs = list(all_data_num)
cat_attribs = all_data_cat

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

In [23]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [24]:
all_data_prepared = full_pipeline.fit_transform(all_data)
all_data_prepared

array([[ 2.11200854,  2.02228428,  2.23556143, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05840052,  1.74738358, -0.03982932, ...,  0.        ,
         0.        ,  0.        ],
       [-0.5369159 , -0.63508916, -0.5305144 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.48564639, -0.26855489, -0.48038976, ...,  0.        ,
         0.        ,  0.        ],
       [-0.36601755, -0.54345559, -0.36035883, ...,  0.        ,
         0.        ,  0.        ],
       [-0.28056837, -0.26855489, -0.27905297, ...,  0.        ,
         0.        ,  0.        ]])

In [25]:
train = all_data_prepared[:ntrain]
test = all_data_prepared[ntrain:]

print('Train shape: {}-------Test shape: {}'.format(train.shape, test.shape))

Train shape: (400000, 148)-------Test shape: (100000, 148)


# Select and Train a Model

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier

#knn_clf = KNeighborsClassifier()
#knn_clf.fit(X_train, y_train)

#forest_clf = RandomForestClassifier(random_state=42)
#forest_clf.fit(X_train, y_train)

#clf = svm.SVC()
#clf.fit(X_train, y_train)

#tree_clf = DecisionTreeClassifier(random_state=42)
#tree_clf.fit(X_train, y_train)

#sdg_clf = SGDClassifier(random_state=42)
#sdg_clf.fit(X_train, y_train)

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100, 50), alpha=0.0001, solver='adam', random_state=42)
mlp.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

In [28]:
#knn_clf.score(X_test, y_test)
#forest_clf.score(X_test, y_test)
#clf.score(X_test, y_test)
#tree_clf.score(X_test, y_test)
#sdg_clf.score(X_test, y_test)\
mlp.score(X_test, y_test)

0.8661875

In [33]:
some_data = train[:20]
#knn_clf.predict_proba(some_data)
#forest_clf.predict_proba(some_data)
#clf.predict_proba(some_data)
y_pred = mlp.predict_proba(some_data)[:, 0]
y_pred

array([0.99999998, 0.92156302, 0.99965462, 0.99978295, 0.99990709,
       0.98211291, 0.60967193, 0.99748294, 1.        , 0.18376534,
       0.90157679, 0.99138329, 0.99825383, 0.99994302, 0.95697776,
       1.        , 0.23972061, 0.99728301, 0.99999562, 0.21393092])

In [37]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test[:20], y_pred)
rmse = np.sqrt(mse)
rmse

0.826699453576314

In [48]:
# Loading submission dataset
submission = pd.read_csv('sample_submission.csv')
submission.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0
1,5335efd940280b82143272275637d1e65d37eadb,0
2,a581f4fa08677c26f83f643248c667e241043086,0
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0


In [49]:
y_pred1.shape

(100000,)

In [50]:
submission['CHURN'] = y_pred1[0]

In [51]:
submission.to_csv('sixth_submission.csv', index=False)

In [52]:
third_sub = pd.read_csv('sixth_submission.csv')
third_sub.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.765723
1,5335efd940280b82143272275637d1e65d37eadb,0.765723
2,a581f4fa08677c26f83f643248c667e241043086,0.765723
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.765723
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.765723
