##### <p> Samuel Wolfe <br> July 26, 2023 <br> MSBA 206 <br> DMBA Case 21.4 Part 2</p>

In [558]:
%matplotlib inline
from pathlib import Path
import pandas as pd
import requests
import io
import matplotlib.pylab as plt
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart

In [559]:
def readFile(url, timeout=30):
    """Download a CSV file over HTTP and load it into a pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as CSV.
    response.raise_for_status()
    # The payload is decoded as UTF-8 and parsed from an in-memory text buffer.
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))
def statslist(df):
    """Summarise the numeric columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns
        'Mean', 'SD', 'Min', 'Max' and 'Median'.
    """
    # Every statistic is restricted to numeric columns so that all five
    # series share the same index; without it Min/Max would add rows for
    # text columns whose other statistics are NaN.
    numeric = {'numeric_only': True}
    dfStats = pd.DataFrame({'Mean' : df.mean(**numeric),
            'SD' : df.std(**numeric),
            'Min' : df.min(**numeric),
            'Max' : df.max(**numeric),
            'Median' : df.median(**numeric),
            })
    return dfStats
def categorize(df):
    """Cast every column of ``df`` to the pandas 'category' dtype.

    The frame is modified in place and the same object is returned.
    """
    for label in df:
        as_category = df[label].astype('category')
        df[label] = as_category
    return df

#### Bringing in the predictors from Part 1

In [560]:
# Predictor columns carried over from Part 1 of the case.
predictors = [
    'PARTY_D', 'HH_ND', 'GENDER_F',
    'COMM_PT', 'VPP_08', 'VPR_08',
    'H_F1', 'PARTY_R', 'VPP_12',
]

#### Importing dataset

In [561]:
# Download the voter-persuasion data set.
dfVote = readFile("https://raw.githubusercontent.com/wolfesamk/MSBA-206/main/dmba/Voter-Persuasion.csv")
# These columns are not needed: they duplicate other values or are stated as not usable.
dropCol = ['VOTER_ID','SET_NO','Partition','MOVED_AD','opposite']
dfVoteDum = pd.get_dummies(dfVote.drop(columns=dropCol))
# .copy() turns the column subset into an independent frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
dfVoteDum = dfVoteDum[predictors].copy()
outcome = 'MOVED_A'
classes = [0,1]

#### Binning COMM_PT

In [562]:
# Replace COMM_PT by its quartile bin (codes 0-3).
# The bins are computed from the raw column in dfVote rather than from
# dfVoteDum itself, so re-running this cell gives the same result instead of
# trying to bin values that were already binned.
dfVoteDum['COMM_PT'] = pd.qcut(dfVote['COMM_PT'], q=4, labels=[0,1,2,3],  precision=0)

#### Converting all predictor columns to the category dtype

In [563]:
# categorize() casts every column of the frame to the pandas 'category' dtype.
dfVoteDum = categorize(dfVoteDum)

#### Adding Partition and MOVED_A back to dfVoteDum

In [564]:
# Copy the partition flag and the outcome back from the raw frame, in this order.
for restored in ('Partition', 'MOVED_A'):
    dfVoteDum[restored] = dfVote[restored]

## 21.4.3

#### Split into partitions

In [565]:
# Full data set: standardised predictors and outcome.
scaler = preprocessing.StandardScaler().fit(dfVoteDum[predictors])
features_all = dfVoteDum.drop(columns=[outcome, 'Partition'])
X_dfVoteDum = scaler.transform(features_all)
Y_dfVoteDum = dfVoteDum[outcome]

# Training partition (Partition == 'T').
in_training = dfVoteDum.Partition == 'T'
dfVote_T = dfVoteDum.loc[in_training].drop(columns='Partition')
X_train = dfVote_T.drop(columns=outcome)
Y_train = dfVote_T[outcome].to_frame()

# Validation partition (Partition == 'V').
in_validation = dfVoteDum.Partition == 'V'
dfVote_V = dfVoteDum.loc[in_validation].drop(columns='Partition')
X_valid = dfVote_V.drop(columns=outcome)
Y_valid = dfVote_V[outcome].to_frame()

#### Normalizing the data

In [566]:
# Min-max scaling is learned from the training predictors only ...
scaler = MinMaxScaler().fit(X_train)
# ... and then applied to both the training and the validation predictors.
X_train_norm = scaler.transform(X_train)
X_valid_norm = scaler.transform(X_valid)

#### Cross-validating KNN on the full dataset for odd k from 1 to 39

In [567]:
# The 10-fold splitter has a fixed seed, so one instance can be reused for
# every k and all candidates are scored on the same folds.
kfold = KFold(n_splits=10,random_state=1,shuffle=True)
results = []
for k in range(1,40,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn,
                             X=X_dfVoteDum, y=Y_dfVoteDum, cv=kfold)
    # Keep the raw floats here: sorting formatted strings would order them
    # lexicographically (e.g. '9.50%' above '70.06%').
    results.append({
        'k' : k,
        'mean accuracy' : scores.mean(),
        'standard deviation' : scores.std()
    })
# Rank numerically first, then format the two score columns for display.
results = pd.DataFrame(results).sort_values(by=['mean accuracy'],ascending=False)
for column in ('mean accuracy', 'standard deviation'):
    results[column] = results[column].map('{:.2%}'.format)
print(results)

     k mean accuracy standard deviation
19  39        70.06%              1.80%
12  25        70.00%              1.80%
17  35        69.98%              1.46%
18  37        69.96%              1.59%
10  21        69.89%              1.52%
16  33        69.85%              1.55%
11  23        69.84%              1.82%
13  27        69.77%              1.52%
15  31        69.76%              1.68%
14  29        69.66%              1.71%
6   13        69.59%              2.01%
7   15        69.50%              1.55%
9   19        69.49%              1.59%
8   17        69.36%              1.48%
4    9        68.48%              1.82%
5   11        68.32%              2.04%
3    7        68.15%              2.09%
2    5        67.14%              1.38%
1    3        66.58%              1.54%
0    1        66.19%              2.08%


#### Selecting knn == 13

In [568]:
# Fit a 13-nearest-neighbour classifier on the min-max scaled training
# predictors and summarise its predictions on the validation partition.
knn13 = KNeighborsClassifier(n_neighbors=13)
knn13.fit(X_train_norm, Y_train['MOVED_A'])
print('KNN == 13')
valid_pred_knn13 = knn13.predict(X_valid_norm)
classificationSummary(Y_valid, valid_pred_knn13)

KNN == 13
Confusion Matrix (Accuracy 0.6898)

       Prediction
Actual    0    1
     0 1859  712
     1  545  936


## Naive Bayes

## Preparing data for Naive Bayes
#### verifying dtypes

In [569]:
# Display the dtype of every column as the cell output.
dfVoteDum.dtypes

PARTY_D      category
HH_ND        category
GENDER_F     category
COMM_PT      category
VPP_08       category
VPR_08       category
H_F1         category
PARTY_R      category
VPP_12       category
Partition      object
MOVED_A         int64
dtype: object

#### Rebuilding the predictor list from the dataframe columns

In [570]:
# Every column except the partition flag and the outcome is a predictor.
predictors = [name for name in dfVoteDum.columns if name not in ('Partition', 'MOVED_A')]

#### Splitting the data by T and V

In [571]:
# NOTE(review): this scaler is fitted but its transform is not applied in this
# cell; the naive Bayes inputs below are the unscaled predictor columns.
scaler = preprocessing.StandardScaler().fit(dfVoteDum[predictors])

# Full data set: unscaled predictors and outcome.
X_dfVoteDum = dfVoteDum.drop(columns=[outcome, 'Partition'])
Y_dfVoteDum = dfVoteDum[outcome]

# Training partition (Partition == 'T').
in_training = dfVoteDum.Partition == 'T'
dfVote_T = dfVoteDum.loc[in_training].drop(columns='Partition')
X_train = dfVote_T.drop(columns=outcome)
Y_train = dfVote_T[outcome].to_frame()

# Validation partition (Partition == 'V').
in_validation = dfVoteDum.Partition == 'V'
dfVote_V = dfVoteDum.loc[in_validation].drop(columns='Partition')
X_valid = dfVote_V.drop(columns=outcome)
Y_valid = dfVote_V[outcome].to_frame()

#### Running Naive Bayes

In [572]:
# Multinomial naive Bayes (additive smoothing alpha=1) fitted on the training partition.
# NOTE(review): the predictors are category-coded columns, and HH_ND and COMM_PT
# have more than two levels; confirm MultinomialNB (rather than CategoricalNB or
# dummy coding) is the intended model for them.
vote_nb = MultinomialNB(alpha=1).fit(X_train, Y_train['MOVED_A'])

# predicted class membership for both partitions
y_train_pred = vote_nb.predict(X_train)
y_valid_pred = vote_nb.predict(X_valid)

# predicted class probabilities for both partitions
predProb_train = vote_nb.predict_proba(X_train)
predProb_valid = vote_nb.predict_proba(X_valid)

#### Checking probability of all predictors vs outcome

In [573]:
# Print the class priors and one conditional-probability table per predictor
# with 4 decimals. option_context restores the previous display precision on
# exit, even if an error interrupts the cell.
with pd.option_context('display.precision', 4):
    # prior probability of each outcome class in the training partition
    print(dfVote_T[outcome].value_counts() / len(dfVote_T))
    print()

    for predictor in predictors:
        # construct the frequency table
        df = dfVote_T[[outcome, predictor]]
        freqTable = df.pivot_table(index=outcome, columns=predictor, aggfunc=len)

        # Divide each row by its row total to get conditional probabilities.
        # DataFrame.sum skips missing cells, so one empty outcome/level
        # combination no longer turns the whole row into NaN.
        propTable = freqTable.div(freqTable.sum(axis=1), axis=0)
        print(propTable)
        print()

MOVED_A
0    0.6212
1    0.3788
Name: count, dtype: float64

PARTY_D       0       1
MOVED_A                
0        0.6620  0.3380
1        0.3076  0.6924

HH_ND         0       1       2       3       4       5       6       7  \
MOVED_A                                                                   
0        0.4858  0.2512  0.1670  0.0631  0.0208  0.0057  0.0038  0.0008   
1        0.1860  0.3316  0.2676  0.1287  0.0581  0.0129  0.0067  0.0018   

HH_ND         8       9  
MOVED_A                  
0        0.0005  0.0014  
1        0.0013  0.0053  

GENDER_F       0       1
MOVED_A                 
0         0.4853  0.5147
1         0.3249  0.6751

COMM_PT       0       1       2       3
MOVED_A                                
0        0.4357  0.1602  0.2382  0.1659
1        0.3395  0.1429  0.2366  0.2810

VPP_08        0       1
MOVED_A                
0        0.8135  0.1865
1        0.7270  0.2730

VPR_08        0       1
MOVED_A                
0        0.8701  0.1299
1    

In [574]:
# Confusion matrix and accuracy of the naive Bayes model on each partition.
# The first summary uses the training partition (Y_train / y_train_pred), so
# it is labelled as training data rather than test data.
print('Training Data Set')
classificationSummary(Y_train, y_train_pred, class_names=classes)
print()
print('Validation Data Set')
classificationSummary(Y_valid, y_valid_pred, class_names=classes)

Test Data Set
Confusion Matrix (Accuracy 0.6742)

       Prediction
Actual    0    1
     0 2398 1297
     1  641 1612

Validation Data Set
Confusion Matrix (Accuracy 0.6693)

       Prediction
Actual    0    1
     0 1640  931
     1  409 1072


## 21.4.4
#### Between the KNN model and the Naive Bayes model, KNN was the more accurate on the validation set, by 2.05 percentage points (68.98% vs. 66.93%). I specifically chose k = 13 for KNN because of the cross-validation results.