##### <p> Samuel Wolfe <br> July 26, 2023 <br> MSBA 206 <br> DMBA Case 21.4 Part 2</p>

In [1]:
%matplotlib inline
from pathlib import Path
import pandas as pd
import requests
import io
import matplotlib.pylab as plt
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart

In [2]:
def readFile(url, timeout=30):
    """Download a CSV file over HTTP and load it into a pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV file to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as CSV.
    response.raise_for_status()
    # The payload is decoded as UTF-8 and parsed from an in-memory text buffer.
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))
def statslist(df):
    """Summarise the numeric columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column of ``df`` with the columns
        'Mean', 'SD', 'Min', 'Max' and 'Median'.
    """
    # Every statistic uses numeric_only=True so that all five columns describe
    # the same set of variables; without it Min/Max would add rows for text
    # columns (NaN elsewhere) or raise on columns mixing strings and missing
    # values.
    return pd.DataFrame({
        'Mean': df.mean(numeric_only=True),
        'SD': df.std(numeric_only=True),
        'Min': df.min(numeric_only=True),
        'Max': df.max(numeric_only=True),
        'Median': df.median(numeric_only=True),
    })

#### Importing dataset

In [3]:
# Target column and its two class labels.
outcome = 'MOVED_A'
classes = [0, 1]

# Voter-level records.
dfVote = readFile("https://raw.githubusercontent.com/wolfesamk/MSBA-206/main/dmba/Voter-Persuasion.csv")
# Lookup table used further down to build the predictor groups.
dfVoteCats = readFile("https://raw.githubusercontent.com/wolfesamk/MSBA-206/main/dmba/Voter-Persuasion-Cats.csv")

# Columns left out of the model frame: duplicates of other values, or stated
# as not usable.
dropCol = ['VOTER_ID', 'SET_NO', 'Partition', 'MOVED_AD', 'opposite']
dfVoteDum = pd.get_dummies(
    dfVote.drop(columns=dropCol),
    drop_first=True,
    dtype='uint8',
)
# Captured before Partition is re-attached, so this list holds every encoded
# column, the outcome column included.
predictorsAll = list(dfVoteDum.columns)
# Partition is needed later to separate the training and validation rows.
dfVoteDum['Partition'] = dfVote['Partition']

#### Creating predictor groups

In [4]:
def _predictors_of_type(cats_df, type_label):
    """Return the `cats` entries of ``cats_df`` whose `types` label matches.

    Labels are compared after stripping surrounding whitespace, so padding in
    the lookup file (the 'Derived' label carries a trailing space) cannot
    silently produce an empty group.
    """
    matches = cats_df['types'].str.strip() == type_label.strip()
    return cats_df.loc[matches, 'cats'].to_list()


predictors_derived = _predictors_of_type(dfVoteCats, 'Derived')
predictors_census = _predictors_of_type(dfVoteCats, 'Census')
predictors_voterfile = _predictors_of_type(dfVoteCats, 'Voterfile')

#### I have opted to first use all variables. If this fails, I will split the predictors by type: Derived, Commercial Data, Voterfile, and Census.

## 21.4.3

#### Split into partitions

In [5]:
# Standardised copy of every predictor over all rows, paired with the outcome;
# this pair feeds the cross-validated search over k.
scaler = preprocessing.StandardScaler()
X_dfVoteDum = scaler.fit_transform(dfVoteDum.drop(columns=[outcome, 'Partition']))
Y_dfVoteDum = dfVoteDum[outcome]

# Training partition (Partition == 'T'): predictors and a one-column outcome frame.
dfVote_T = dfVoteDum.loc[dfVoteDum['Partition'] == 'T'].drop(columns='Partition')
X_train = dfVote_T.drop(columns=[outcome])
Y_train = dfVote_T[[outcome]]

# Validation partition (Partition == 'V'), built the same way.
dfVote_V = dfVoteDum.loc[dfVoteDum['Partition'] == 'V'].drop(columns='Partition')
X_valid = dfVote_V.drop(columns=[outcome])
Y_valid = dfVote_V[[outcome]]

#### Normalizing the data

In [6]:
# Min-max scaling learned from the training partition only; the validation
# partition is rescaled with those same training ranges.
scaler = MinMaxScaler().fit(X_train)
X_train_norm, X_valid_norm = (scaler.transform(part) for part in (X_train, X_valid))

#### Running full dataset knn ranges

In [7]:
# 10-fold cross-validated accuracy for odd k from 1 to 19.
# NOTE: X_dfVoteDum was standardised on all rows before the folds are drawn,
# so each fold's held-out rows contributed to the scaling.

# The splitter is seeded (random_state=1), so a single instance yields the
# same folds for every k.
kfold = KFold(n_splits=10, random_state=1, shuffle=True)
results = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn,
                             X=X_dfVoteDum, y=Y_dfVoteDum, cv=kfold)
    results.append({
        'k': k,
        'mean accuracy': scores.mean(),
        'standard deviation': scores.std(),
    })
# Rank on the numeric scores: sorting the formatted strings would order them
# as text (e.g. '9.50%' ahead of '85.00%').
results = pd.DataFrame(results).sort_values(by=['mean accuracy'], ascending=False)
# Percent formatting is applied only after ranking, for display.
for col in ['mean accuracy', 'standard deviation']:
    results[col] = results[col].map('{:.2%}'.format)
print(results)

    k mean accuracy standard deviation
4   9        85.73%              1.04%
1   3        85.68%              1.13%
2   5        85.67%              1.13%
6  13        85.61%              1.21%
7  15        85.56%              1.16%
3   7        85.54%              1.12%
8  17        85.51%              1.23%
9  19        85.49%              1.24%
5  11        85.47%              1.23%
0   1        83.87%              0.94%


#### Selecting knn == 9

In [8]:
# 9-neighbour classifier trained on the min-max-scaled training partition and
# summarised (confusion matrix, accuracy) on the validation partition.
knn9 = KNeighborsClassifier(n_neighbors=9)
knn9.fit(X_train_norm, Y_train['MOVED_A'])
print('KNN == 9')
knn9_valid_pred = knn9.predict(X_valid_norm)
classificationSummary(Y_valid, knn9_valid_pred)

KNN == 9
Confusion Matrix (Accuracy 0.9000)

       Prediction
Actual    0    1
     0 2371  200
     1  205 1276


## Naive Bayes

In [9]:
# Multinomial naive Bayes (alpha=1) fitted on the min-max-scaled training
# partition.
vote_nb = MultinomialNB(alpha=1)
vote_nb.fit(X_train_norm, Y_train['MOVED_A'])

# Predicted class for each row of both partitions.
y_train_pred = vote_nb.predict(X_train_norm)
y_valid_pred = vote_nb.predict(X_valid_norm)

# Predicted class probabilities for each row of both partitions.
predProb_train = vote_nb.predict_proba(X_train_norm)
predProb_valid = vote_nb.predict_proba(X_valid_norm)

In [10]:
# Naive Bayes performance on the rows it was fitted on. This heading used to
# read 'Test Data Set', but Y_train / y_train_pred are the training partition.
print('Training Data Set')
classificationSummary(Y_train, y_train_pred, class_names=classes)
print()
# Naive Bayes performance on the held-out validation partition.
print('Validation Data Set')
classificationSummary(Y_valid, y_valid_pred, class_names=classes)

Test Data Set
Confusion Matrix (Accuracy 0.8122)

       Prediction
Actual    0    1
     0 3120  575
     1  542 1711

Validation Data Set
Confusion Matrix (Accuracy 0.8164)

       Prediction
Actual    0    1
     0 2169  402
     1  342 1139


# First predictor group
#### predictors_derived

#### Split into partitions

In [24]:
# Target column and its two class labels.
outcome = 'MOVED_A'
classes = [0, 1]
# Re-declared from the full-data section; not referenced in this cell.
dropCol = ['VOTER_ID', 'SET_NO', 'Partition', 'MOVED_AD', 'opposite']

# Keep only the derived predictors, dummy-encode them, then attach the
# partition flag and the outcome.
dfVoteDer = dfVote[predictors_derived]
dfVoteDum = pd.get_dummies(dfVoteDer, drop_first=True, dtype='uint8').assign(
    Partition=dfVote['Partition'],
    MOVED_A=dfVote['MOVED_A'],
)

In [25]:
# Standardised copy of every derived predictor over all rows, paired with the
# outcome; this pair feeds the cross-validated search over k.
# The predictor columns are taken as "everything except outcome and Partition"
# (the same rule X_train / X_valid use below and the full-data section uses),
# rather than by the original predictors_derived names, which would no longer
# exist for any column that get_dummies expanded.
scaler = preprocessing.StandardScaler()
X_dfVoteDum = scaler.fit_transform(dfVoteDum.drop(columns=[outcome, 'Partition']))
Y_dfVoteDum = dfVoteDum[outcome]

# Training partition (Partition == 'T'): predictors and a one-column outcome frame.
dfVote_T = dfVoteDum[dfVoteDum.Partition == 'T'].drop(columns='Partition')
X_train = dfVote_T.drop(columns=outcome)
Y_train = dfVote_T[outcome].to_frame()

# Validation partition (Partition == 'V'), built the same way.
dfVote_V = dfVoteDum[dfVoteDum.Partition == 'V'].drop(columns='Partition')
X_valid = dfVote_V.drop(columns=outcome)
Y_valid = dfVote_V[outcome].to_frame()

#### Normalizing the data

In [26]:
# Min-max scaling learned from the training partition only; the validation
# partition is rescaled with those same training ranges.
scaler = MinMaxScaler().fit(X_train)
X_train_norm, X_valid_norm = (scaler.transform(part) for part in (X_train, X_valid))

#### Running derived dataset knn ranges

In [27]:
# 10-fold cross-validated accuracy for odd k from 1 to 19 on the derived
# predictors.
# NOTE: X_dfVoteDum was standardised on all rows before the folds are drawn,
# so each fold's held-out rows contributed to the scaling.

# The splitter is seeded (random_state=1), so a single instance yields the
# same folds for every k.
kfold = KFold(n_splits=10, random_state=1, shuffle=True)
results = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn,
                             X=X_dfVoteDum, y=Y_dfVoteDum, cv=kfold)
    results.append({
        'k': k,
        'mean accuracy': scores.mean(),
        'standard deviation': scores.std(),
    })
# Rank on the numeric scores: sorting the formatted strings would order them
# as text (e.g. '9.50%' ahead of '85.00%').
results = pd.DataFrame(results).sort_values(by=['mean accuracy'], ascending=False)
# Percent formatting is applied only after ranking, for display.
for col in ['mean accuracy', 'standard deviation']:
    results[col] = results[col].map('{:.2%}'.format)
print(results)

    k mean accuracy standard deviation
8  17        71.53%              2.08%
9  19        71.41%              1.72%
7  15        71.35%              2.05%
5  11        71.26%              2.09%
6  13        71.07%              1.85%
4   9        70.79%              1.97%
3   7        70.61%              1.61%
2   5        69.87%              1.70%
1   3        69.29%              1.67%
0   1        66.38%              1.53%


#### Selecting knn == 17

In [28]:
# 17-neighbour classifier trained on the min-max-scaled training partition and
# summarised (confusion matrix, accuracy) on the validation partition.
knn17 = KNeighborsClassifier(n_neighbors=17).fit(X_train_norm, Y_train.MOVED_A)
# The label now matches the model: it previously read 'KNN == 9', copied from
# the full-data section.
print('KNN == 17')
classificationSummary(Y_valid, knn17.predict(X_valid_norm))

KNN == 9
Confusion Matrix (Accuracy 0.7009)

       Prediction
Actual    0    1
     0 2029  542
     1  670  811


## Naive Bayes

In [29]:
# Multinomial naive Bayes (alpha=1) fitted on the min-max-scaled training
# partition of the derived predictors.
vote_nb = MultinomialNB(alpha=1)
vote_nb.fit(X_train_norm, Y_train['MOVED_A'])

# Predicted class for each row of both partitions.
y_train_pred = vote_nb.predict(X_train_norm)
y_valid_pred = vote_nb.predict(X_valid_norm)

# Predicted class probabilities for each row of both partitions.
predProb_train = vote_nb.predict_proba(X_train_norm)
predProb_valid = vote_nb.predict_proba(X_valid_norm)

In [30]:
# Naive Bayes performance on the rows it was fitted on. This heading used to
# read 'Test Data Set', but Y_train / y_train_pred are the training partition.
print('Training Data Set')
classificationSummary(Y_train, y_train_pred, class_names=classes)
print()
# Naive Bayes performance on the held-out validation partition.
print('Validation Data Set')
classificationSummary(Y_valid, y_valid_pred, class_names=classes)

Test Data Set
Confusion Matrix (Accuracy 0.6606)

       Prediction
Actual    0    1
     0 3386  309
     1 1710  543

Validation Data Set
Confusion Matrix (Accuracy 0.6671)

       Prediction
Actual    0    1
     0 2354  217
     1 1132  349


predictors_census
predictors_commercial
predictors_voterfile