In [13]:
import numpy as np
import pandas as pd
import pandas_profiling

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
import os
import random

# Single seed shared by every random number source used in this notebook.
seed = 42

# PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched after this point, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed the stdlib generator and NumPy's global generator.
random.seed(seed)
np.random.seed(seed)

In [55]:
def _classification(x, y, result:list) -> list:
    """Fit a fixed panel of classifiers and record their hold-out accuracy.

    The data is split once (80% train / 20% test, stratified on ``y``,
    ``random_state=1``) and every classifier is trained and scored on that
    same split, so the scores are directly comparable.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``x``.
    result : list
        List that receives one ``{'method': <label>, 'score': <accuracy>}``
        dict per classifier, appended in place.

    Returns
    -------
    list
        The same ``result`` list, returned for convenience.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1, stratify=y
    )

    # (label, estimator) pairs, all with default hyper-parameters. The order
    # matters: the tree-based models have no random_state of their own, so
    # they should be fitted in a fixed sequence.
    # The 'Logisitic' spelling is kept so the label matches already recorded
    # output of this notebook.
    candidates = [
        ('KNN Classifier', KNeighborsClassifier()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
        ('Logisitic Regression Classifier', LogisticRegression()),
        ('DT Classifier', DecisionTreeClassifier()),
        ('RF Classifier', RandomForestClassifier()),
    ]

    for label, model in candidates:
        model.fit(x_train, y_train)
        predicted = model.predict(x_test)
        # accuracy_score expects (y_true, y_pred).
        result.append({'method': label, 'score': accuracy_score(y_test, predicted)})

    return result


In [65]:
le = LabelEncoder()
def _transform(x, columns, other=None):
    """Label-encode the given columns of two frames in place, consistently.

    For each column the shared encoder ``le`` is fitted on the values of both
    frames together and then applied to each of them. Fitting on the union
    means a category that appears only in ``other`` no longer makes
    ``le.transform`` raise; when ``other`` has no such categories the codes
    are the same as when fitting on ``x`` alone.

    Parameters
    ----------
    x : pandas.DataFrame
        First frame to encode; its columns are overwritten in place.
    columns : iterable of str
        Names of the columns to encode; each must exist in both frames.
    other : pandas.DataFrame, optional
        Second frame to encode with the same mapping, also overwritten in
        place. When omitted, the notebook-level ``test`` frame is used, which
        is what this function always did before the parameter existed.
    """
    if other is None:
        # Backward-compatible fallback to the notebook global.
        other = test
    for column in columns:
        le.fit(pd.concat([x[column], other[column]], ignore_index=True))
        x[column] = le.transform(x[column])
        other[column] = le.transform(other[column])


In [83]:
# Load the labelled training data and the test data to predict on.
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the target and drop the columns that are not used as features.
y = data['Churn']
x = data.drop(columns=['Churn', 'Unnamed: 0', 'customerID'])
test = test.drop(columns=['Unnamed: 0', 'customerID'])
print(x.shape)

# Some training rows hold a single blank instead of a TotalCharges value;
# remove them from both the features and the target.
blank_rows = x.index[x.TotalCharges == ' ']
y = y.drop(blank_rows)
x = x.drop(blank_rows)

# With the blanks gone, make the column numeric explicitly instead of
# relying on later steps to coerce strings.
x['TotalCharges'] = x['TotalCharges'].astype(float)
print(x.shape)

# NOTE(review): `test` is not filtered or converted here; if it also has
# blank TotalCharges cells, later numeric steps will fail on it. Confirm.

(4788, 20)
(4778, 20)


In [84]:
# Categorical feature columns that need to be label-encoded.
columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Encodes these columns in place in `x` and in the notebook-level `test`.
_transform(x, columns)

In [78]:
# Learn the scaling statistics from the training features only, then apply
# the same scaling to both sets. After this cell `x` and `test` hold the
# scaler's output rather than the original frames.
# NOTE(review): the scaler is fitted on all training rows before
# _classification makes its hold-out split, so the hold-out rows contribute
# to the statistics. Confirm whether that is acceptable.
sc = StandardScaler().fit(x, y)
x = sc.transform(x)
test = sc.transform(test)

In [79]:
# Score every candidate classifier on the same hold-out split and show the
# scores as a table (one row per method).
result = list()
_classification(x, y, result)
print(pd.DataFrame(result, columns=['method', 'score']))

                            method     score
0                   KNN Classifier  0.774059
1                              LDA  0.797071
2                              QDA  0.771967
3  Logisitic Regression Classifier  0.802301
4                    DT Classifier  0.726987
5                    RF Classifier  0.804393


In [85]:
# Template that defines the layout of the submission file.
submit = pd.read_csv('sample_submit.csv')

# Train the submission model on every labelled row.
# NOTE(review): despite its name, `lda` holds a logistic regression; the
# name is kept in case other cells refer to it.
lda = LogisticRegression(solver='newton-cg')
lda.fit(x, y)

# Predict the test set and recode the 'No' / 'Yes' labels as 0 / 1.
res = pd.DataFrame({'Churn': lda.predict(test)})
res.loc[res.Churn == 'No', 'Churn'] = 0
res.loc[res.Churn == 'Yes', 'Churn'] = 1

# Copy the predictions into the template and write it without the index.
submit['Churn'] = res['Churn']
submit.to_csv('submit.csv', index=False)