In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas_profiling
import random
import os
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [2]:
def label_encode(train:pd.DataFrame, test:pd.DataFrame, columns:list):
    le = LabelEncoder()
    
    for column in columns:
        labels = pd.concat([train[column], test[column]]) .drop_duplicates()
        le.fit(labels)
        train[column] = le.transform(train[column])
        test[column] = le.transform(test[column])

In [31]:
def _classification(x, y, result:list):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)
    
    for model in [KNeighborsClassifier(), KNeighborsClassifier(weights='distance'), LogisticRegression(solver='newton-cg'), LogisticRegression(), LinearDiscriminantAnalysis(), DecisionTreeClassifier(), RandomForestClassifier()]:
        model.fit(x_train, y_train)
        res = model.predict(x_test)
        result.append({'model': str(model), 'score': accuracy_score(res, y_test)})

In [82]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [83]:
drop_columns = ['customerID', 'index', 'Unnamed: 0']
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)

In [84]:
label_encode(train, test, columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'])

In [85]:
train = train.drop(train[train.TotalCharges ==  ' '].index)

In [86]:
y = train['Churn']
train = train.drop('Churn', axis=1)

In [89]:
y = y.replace('No', 0)
y = y.replace('Yes', 1)
y

0       0
1       0
2       1
3       0
4       0
       ..
4783    1
4784    0
4785    0
4786    1
4787    0
Name: Churn, Length: 4778, dtype: int64

In [90]:
result = []
_classification(train, y, result)
print(pd.DataFrame(result))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                                      model     score
0                    KNeighborsClassifier()  0.763598
1  KNeighborsClassifier(weights='distance')  0.759414
2    LogisticRegression(solver='newton-cg')  0.803347
3                      LogisticRegression()  0.802301
4              LinearDiscriminantAnalysis()  0.797071
5                  DecisionTreeClassifier()  0.755230
6                  RandomForestClassifier()  0.802301


In [91]:
model = LogisticRegression(solver='newton-cg')
model.fit(train, y)
res = model.predict(test)
submit = pd.read_csv('sample_submit.csv')
submit['Churn'] = res

In [92]:
submit.to_csv('submit.csv', index=False)