In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import category_encoders as ce
from tqdm import tnrange, tqdm_notebook
import itertools
import time
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')

In [4]:
class predictor():
    def _load_data(self):
        self.data = pd.read_csv('CaseCompetitionData2021.csv')
        del self.data['post_purchase_satisfaction']
        pd.options.display.max_columns= None
    
    def __init__(self):
        return self._load_data()
    
    def results_binary(self):
        self.data.loc[self.data['subsequent_purchases'] > 0, ['subsequent_purchases']] = 1
        
    def clean_data(self,value):
        self.value = value
        check = self.data.isin([self.value])
        df_obj = check.any()
        listofIs =[]
        for col in list(df_obj[df_obj == True].index):
            rows = list(res[col][res[col] == True].index)
            for row in rows:
                listofIs.append((row, col))
        Na_values = listofIs
        na_values = [Na_values[i][0] for i in range(len(Na_values))]
        self.data = self.data.drop(index=na_values)
        return self.data
    
    def convert_bins(self,dfcolumn):
        self.dfcolumn = dfcolumn
        column = list(self.data[self.dfcolumn])
        columnlist= []
        for i in range(len(column)):
            if '-' in column[i]:
                columnlist.append(column[i].split('-'))
            else:
                columnlist.append(column[i].split('+'))
        for i in range(len(columnlist)):
            for j in range(len(columnlist[i])):
                try:
                    columnlist[i][j]=float(columnlist[i][j])
                except:
                    columnlist[i]=[columnlist[i][0]]
        means = []
        for i in range(len(columnlist)):
            if len(columnlist[i]) == 2:
                means.append(sum(columnlist[i])/2)
            else:
                means.append(columnlist[i][0])
        return means
    
    def _converter(self):
        self.data['purchase_means'] = self.convert_bins('purchase_price')
        self.data['age_means']=self.convert_bins('customer_age')
        self.data['customer_income_mean']=self.convert_bins('customer_income')
        self.data = self.data.drop(['purchase_price','customer_age','customer_income'],axis=1)
        return
    
    def _info(self):
        return self.data.info()
    
    def categoricals(self):
        self.cat_df = self.data.select_dtypes(include=['object']).copy()
        encoder = ce.BinaryEncoder(cols=['purchase_make','purchase_model','customer_gender'])
        df_binary=encoder.fit_transform(self.cat_df)
        self.data = self.data.drop(['purchase_make','purchase_model','customer_gender','customer_distance_to_dealer'],axis=1)
        self.result = pd.concat([self.data,df_binary],axis=1)
        self.result = self.result.drop(['insert_num','purchase_vehicle_year'],axis=1)
        return 
    
    
    def knn_classifier(self):
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.model_selection import train_test_split
        from sklearn import metrics
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import classification_report
        features = self.result.loc[:,self.result.columns != 'subsequent_purchases']
        X = features
        y = self.result['subsequent_purchases']
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)       
        knn = KNeighborsClassifier(n_neighbors=12)
        knn.fit(X_train,y_train)
        y_pred = knn.predict(X_test)
        confusion_matrix(y_test,y_pred)
        pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
        #print(classification_report(y_test,y_pred))
        #print()
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    
    def naive_bayes(self):
        from sklearn.model_selection import train_test_split
        from sklearn.naive_bayes import GaussianNB
        from sklearn.metrics import accuracy_score
        features = self.result.loc[:,self.result.columns != 'subsequent_purchases']
        X = features
        y = self.result['subsequent_purchases']
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
        model = GaussianNB()
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        print("Accuracy:",accuracy_score(y_test,y_pred)*100)
        
        
    def random_forest(self):
        from sklearn.ensemble import RandomForestClassifier
        from sklearn import metrics
        features = self.result.loc[:,self.result.columns != 'subsequent_purchases']
        X = features
        y = self.result['subsequent_purchases']
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
        #Create a Gaussian Classifier
        clf=RandomForestClassifier(n_estimators=128)
        #Train the model using the training sets y_pred=clf.predict(X_test)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_test)
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        
    def logistic(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn import metrics
        features = self.result.loc[:,self.result.columns != 'subsequent_purchases']
        X = features
        y = self.result['subsequent_purchases']
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
        logreg = LogisticRegression()
        # fit the model with data
        logreg.fit(X_train,y_train)
        y_pred=logreg.predict(X_test)
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        
    def fit_linear_reg(self,X,Y):
        self.X = X
        self.Y = Y 
        model_k = linear_model.LinearRegression(fit_intercept = True)
        model_k.fit(X,Y)
        RSS = mean_squared_error(Y,model_k.predict(X)) * len(Y)
        R_squared = model_k.score(X,Y)
        return RSS, R_squared
    
    def income_subset(self):
        self.result = self.result[self.result['customer_income_mean'] > 100000]
        return 
    
    
    
test = predictor()

test.clean_data('?')
test._converter()
test.categoricals()
test.logistic()

  elif pd.api.types.is_categorical(cols):


Accuracy: 0.6760408195388127


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
