In [139]:
import pandas as pd
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from scipy.stats import t, norm
import numpy as np

In [32]:
data = pd.read_csv('Customer-Churn.csv')

In [33]:
def encode_string(data_col):
    dif_obj = data_col.unique()
    new_list = []
    for x in data_col:
        for obj in range(0,len(dif_obj)):
            if(x == dif_obj[obj]):
                new_list.append(obj)
    return new_list

CLEAN

In [34]:
data['gender'] = encode_string(data['gender'])
data['Partner'] = encode_string(data['Partner'])
data['Dependents'] = encode_string(data['Dependents'])
data['PhoneService'] = encode_string(data['PhoneService'])
data['OnlineSecurity'] = encode_string(data['OnlineSecurity'])
data['OnlineBackup'] = encode_string(data['OnlineBackup'])
data['DeviceProtection'] = encode_string(data['DeviceProtection'])
data['TechSupport'] = encode_string(data['TechSupport'])
data['StreamingTV'] = encode_string(data['StreamingTV'])
data['StreamingMovies'] = encode_string(data['StreamingMovies'])
data['Contract'] = encode_string(data['Contract'])
data['Churn'] = encode_string(data['Churn'])

unwanted_rows = data['TotalCharges'][data['TotalCharges'] == ' ']
data = data.drop(unwanted_rows.index, axis = 0).reset_index()
data['TotalCharges'] = data['TotalCharges'].apply(lambda x: float(x))

SPLITTING

In [86]:
y = data['Churn']
X = data.drop(['Churn'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

-SMOTE

In [87]:
oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

-LINEAR REG

In [88]:
model = LinearRegression().fit(X_train_smote, y_train_smote)
model.score(X_test, y_test)

0.04737632199376829

-DECISSION TREE

In [89]:
model = DecisionTreeRegressor().fit(X_train_smote, y_train_smote)
model.score(X_test, y_test)

-0.49993953644608635

*TOMEK LINK

In [94]:
oversample = TomekLinks()
X_train_tomek, y_train_tomek = oversample.fit_resample(X_train, y_train)

*LINEAR REG

In [95]:
model = LinearRegression().fit(X_train_tomek, y_train_tomek)
model.score(X_test, y_test)

0.22343626240590897

*DECISSION TREE

In [96]:
model = DecisionTreeRegressor().fit(X_train_tomek, y_train_tomek)
model.score(X_test, y_test)

-0.5619449109842121

In [128]:
import pandas as pd 
import numpy as np 
pd.set_option('display.max_columns', None) 
import warnings 
warnings.filterwarnings('ignore')

churnData = pd.read_csv('Customer-Churn.csv') 
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce') 
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))

from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE 

smote = SMOTE() 
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']] 
transformer = StandardScaler().fit(X) 
X = transformer.transform(X) 
y = churnData['Churn'] 
X_sm, y_sm = smote.fit_sample(X, y)
y_sm.value_counts()

Yes    5174
No     5174
Name: Churn, dtype: int64

In [129]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3
0,-1.277445,-0.439916,-1.160323,-0.994971
1,0.066327,-0.439916,-0.259629,-0.173876
2,-1.236724,-0.439916,-0.362660,-0.960399
3,0.514251,-0.439916,-0.746535,-0.195400
4,-1.236724,-0.439916,0.197365,-0.941193
...,...,...,...,...
7038,-0.340876,-0.439916,0.665992,-0.129281
7039,1.613701,-0.439916,1.277533,2.242808
7040,-0.870241,-0.439916,-1.168632,-0.855182
7041,-1.155283,2.273159,0.320338,-0.872777


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 1000)

In [140]:
model1 = DecisionTreeClassifier()
model2 = SGDClassifier()

In [141]:
model_pipeline = [model1, model2]
model_names = ['Regression Tree', 'Linear Regression']


def confidence_intervals(model_pipeline, model_names, X_train, y_train, alpha = 0.05, K = 10):
# We set the significance level
#alpha = 0.05
#K = 10
    scores = {}
    i=0
    for model in model_pipeline:
        mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=K))
        if (K < 30):
            # t.ppf(area) gives us the critical value corresponding to the area for the t-student distribution.
            t_critical = abs(t.ppf(1-alpha/2, K-1)) 
            interval = t_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        else:
            # norm.ppf(area) gives us the critical value corresponding to the area for the normal distribution
            z_critical = abs(norm.ppf(1-alpha/2)) 
            interval = z_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        scores[model_names[i]] = [mean_score, mean_score - interval, mean_score + interval]
        print("The rmse of the {} model is (CV witk K={}) = {:4.2f} +/- {:4.2f}".format(model_names[i], K, mean_score, interval))
        i = i+1

confidence_intervals(model_pipeline, model_names, X_train, y_train, 0.05, 5)

The rmse of the Regression Tree model is (CV witk K=5) = 0.73 +/- 0.01
The rmse of the Linear Regression model is (CV witk K=5) = 0.76 +/- 0.03
