<a href="https://colab.research.google.com/github/vanderbilt-ml/50-Crook-mlproj-Honesty/blob/main/HonestyNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Overview
This project will attempt to predict a correlation between a person's religion and whether they display personality traits indicative of Machiavellian tendencies. Included in this prediction will be whether the person considers themselves more introverted or extroverted.

## Data Set
Machiavellianism Test on Kaggle

## Performance Measures
This will be based on a percentage of people with more honest traits and what their religious preference is along with whether they identify as being more reserved or extroverted.

# Feature Engineering

In [15]:
#tables and visualizations
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import config_context
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [16]:
# Location of the tab-separated data file in the project repository
DATA_URL = 'https://github.com/vanderbilt-ml/50-Crook-mlproj-Honesty/blob/main/data.csv?raw=true'

# Read the file into a DataFrame, then summarise its columns, dtypes and memory use
data = pd.read_csv(DATA_URL, sep='\t')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73489 entries, 0 to 73488
Columns: 105 entries, Q1A to major
dtypes: float64(64), int64(39), object(2)
memory usage: 58.9+ MB


# Compute Scores, Delete Columns and Add Scores columns
There is a list of 20 questions in the data. We don't necessarily care about the individual questions, but rather the total score obtained by adding all of the answers together. Therefore, we need to calculate those scores, correct them if necessary, remove the question columns, and add a score column.

In [19]:
# Naming scheme of the per-question columns: prefix + question number + suffix
# (e.g. "Q1A"). These names are also used by the cell that drops the columns.
prefix = "Q"
a_suffix = "A"
i_suffix = "I"
e_suffix = "E"

# The 20 answer columns, Q1A .. Q20A, whose values are added up per row
answer_columns = [prefix + str(question + 1) + a_suffix for question in range(20)]

# Add the answers of each row in one vectorised pass instead of looking up
# every cell individually with .loc (20 lookups for each of the rows).
# skipna=False makes a row with a missing answer produce NaN, exactly as
# adding the values one by one would; pandas' default would skip the gap.
scores = data[answer_columns].sum(axis=1, skipna=False).tolist()

# Check and make sure we have no outliers: count totals below 20 or above 100
outliers = sum(1 for score in scores if score < 20 or score > 100)

print("Outliers " + str(outliers))

Outliers 0


In [20]:
# Now we can remove the columns we don't need: the "A", "I" and "E" column
# of each of the 20 questions (Q1A, Q1I, Q1E, ..., Q20A, Q20I, Q20E)
question_columns = [prefix + str(question + 1) + suffix
                    for question in range(20)
                    for suffix in (a_suffix, i_suffix, e_suffix)]

# Drop them in a single call instead of 60 separate in-place drops.
# errors='ignore' lets this cell be re-run after the columns are already
# gone, where the in-place version raised a KeyError.
data = data.drop(columns=question_columns, errors='ignore')

# Now add the Scores column
data['Score'] = scores

In [21]:
# Show the data after the conversion. display() renders the frame as a rich
# table, consistent with the sanity-check cell further down, instead of the
# plain-text dump produced by print().
display(data)

      country  introelapse  testelapse  surveyelapse  TIPI1  TIPI2  TIPI3  \
0          GB         49.0       328.0           426      6      5      6   
1          US         38.0       143.0           150      2      5      6   
2          US          4.0       143.0           157      1      7      6   
3          CH         60.0       191.0           269      6      5      5   
4          NL         37.0       302.0           334      2      5      5   
...       ...          ...         ...           ...    ...    ...    ...   
73484      US          6.0       494.0           340      6      4      5   
73485      PL          6.0       295.0           140      5      7      5   
73486      RO         11.0       111.0           122      1      6      1   
73487    NONE         56.0       269.0           198      5      4      5   
73488      US         24.0       198.0           197      6      7      6   

       TIPI4  TIPI5  TIPI6  ...  screenh  hand  religion  orientation  race

In [22]:
# Count the missing entries in every column
data.isna().sum()

country            12
introelapse         3
testelapse          3
surveyelapse        0
TIPI1               0
TIPI2               0
TIPI3               0
TIPI4               0
TIPI5               0
TIPI6               0
TIPI7               0
TIPI8               0
TIPI9               0
TIPI10              0
VCL1                0
VCL2                0
VCL3                0
VCL4                0
VCL5                0
VCL6                0
VCL7                0
VCL8                0
VCL9                0
VCL10               0
VCL11               0
VCL12               0
VCL13               0
VCL14               0
VCL15               0
VCL16               0
education           0
urban               0
gender              0
engnat              0
age                 0
screenw             3
screenh             3
hand                0
religion            0
orientation         0
race                0
voted               0
married             0
familysize          0
major           26338
Score     

# Split the Data

In [23]:
# Column to predict, and the seed that fixes the random split
class_column = 'religion'
random_seed = 2435

# Separate the predictors from the label before splitting
features = data.drop(columns=class_column)
labels = data[class_column]

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    features, labels,
    test_size=0.2, random_state=random_seed, stratify=labels)

Sanity Check

In [24]:
# Report the dimensions and the first rows of the train and test features.
# `lead` reproduces the blank line that separates the two reports.
for lead, part_name, part in (('', 'train', X_train), ('\n', 'test', X_test)):
    print(f'{lead}On X {part_name}: ')
    print(f'X {part_name} dimensions: ', part.shape)
    display(part.head())

On X train: 
X train dimensions:  (58791, 45)


Unnamed: 0,country,introelapse,testelapse,surveyelapse,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,...,screenw,screenh,hand,orientation,race,voted,married,familysize,major,Score
37141,DO,4.0,189.0,200,3,3,3,5,5,4,...,320.0,570.0,1,1,30,1,1,3,Biology,58.0
33754,US,4.0,142.0,138,3,3,5,2,5,5,...,375.0,667.0,1,1,60,1,1,2,Political Science,61.0
57441,CA,8.0,259.0,225,4,5,7,2,7,5,...,1280.0,800.0,1,1,60,1,3,3,business,62.0
50056,AU,7.0,114.0,143,7,6,6,1,6,2,...,1366.0,768.0,1,1,60,2,1,1,,54.0
15970,US,48.0,178.0,216,6,1,4,1,6,6,...,768.0,1024.0,1,1,60,2,1,7,,62.0



On X test: 
X test dimensions:  (14698, 45)


Unnamed: 0,country,introelapse,testelapse,surveyelapse,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,...,screenw,screenh,hand,orientation,race,voted,married,familysize,major,Score
21412,US,3.0,167.0,264,3,1,7,4,2,5,...,768.0,1024.0,1,3,60,1,1,4,,65.0
52014,ID,18158.0,173.0,204,1,5,5,6,5,7,...,393.0,786.0,1,1,10,1,1,3,Civil Engineering,73.0
53316,US,2.0,219.0,208,3,5,7,2,6,3,...,1920.0,1080.0,1,1,60,1,2,4,,60.0
55824,GB,12.0,147.0,197,4,5,7,2,5,5,...,486.0,729.0,1,1,60,2,1,2,Accountancy,56.0
27974,CL,63.0,498.0,257,5,6,6,7,7,3,...,1366.0,768.0,1,1,70,2,1,2,,57.0


Create Pipelines

In [31]:
# One preprocessing pipeline per datatype.

# Categorical: fill missing entries with the most frequent value, then one-hot encode
cat_steps = [
    ('cat_impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Numeric: fill missing entries with the column mean, then standardise
num_steps = [
    ('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale_num', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [32]:
# Route each column to the pipeline matching its dtype: object columns go to
# the categorical pipeline, numeric columns to the numeric pipeline, and any
# remaining column is passed through unchanged.
select_categorical = make_column_selector(dtype_include=object)
select_numeric = make_column_selector(dtype_include=np.number)

preproc = ColumnTransformer(
    [
        ('cat_pipe', cat_pipeline, select_categorical),
        ('num_pipe', num_pipeline, select_numeric),
    ],
    remainder='passthrough',
)

In [33]:
# Full modelling pipeline: shared preprocessing followed by an elastic-net
# logistic regression fitted with the saga solver
lr_model = LogisticRegression(penalty='elasticnet', solver='saga', tol=0.01)
LRpipe = Pipeline(steps=[
    ('preproc', preproc),
    ('mdl', lr_model),
])

# Render the pipeline steps as a diagram
with config_context(display='diagram'):
    display(LRpipe)

# Assignment 5
## Explore 3 different models in your ML pipeline for your personal project

In [34]:
# Set up Random Forest Pipeline.
# random_state reuses the notebook's seed so the forest is reproducible.
randomForest_pipe = Pipeline(steps=[('preproc', preproc),
                       ('mdl', RandomForestClassifier(random_state=random_seed))])

#visualization for steps
with config_context(display='diagram'):
    display(randomForest_pipe)

# Set up Naive Bayes classifier for multinomial models.
# MultinomialNB raises an error on negative feature values, and the shared
# `preproc` standardises the numeric columns (which produces negative values).
# The Naive Bayes pipeline therefore gets its own preprocessor that rescales
# the numeric columns to [0, 1]; clip=True keeps unseen values in that range.
nb_num_pipeline = Pipeline(steps=[('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
                                  ('scale_num', MinMaxScaler(clip=True))])
nb_preproc = ColumnTransformer([('cat_pipe', cat_pipeline, make_column_selector(dtype_include=object)),
                                ('num_pipe', nb_num_pipeline, make_column_selector(dtype_include=np.number))],
                               remainder='passthrough')
naiveBayes_pipe = Pipeline(steps=[('preproc', nb_preproc),
                       ('mdl', MultinomialNB())])

#visualization for steps
with config_context(display='diagram'):
    display(naiveBayes_pipe)

# Cross Validation with Hyperparameter Tuning

In [35]:
# Set up tuning grids

# Elastic-net logistic regression.
# l1_ratio is the L1/L2 mixing weight and must lie in [0, 1]; the previous grid
# ran from 0 to 100 and would be rejected. C previously reached 10**100; it now
# spans 1e-3 .. 1e3. The grid is kept small (5 x 7 combinations) because every
# combination is refitted once per cross-validation fold.
logisticRegression_tuning_grid = {'mdl__l1_ratio' : np.linspace(0, 1, 5),
               'mdl__C': np.logspace(-3, 3, 7) }

randomForest_tuning_grid = {'mdl__n_estimators' : [100, 200 ,500],
               'mdl__max_depth': [10, 15, 20] }

# Multinomial Naive Bayes: tune the additive smoothing strength.
# This must be a parameter grid (a dict) because it is what gets passed as
# GridSearchCV's param_grid; it used to hold a cross-validation splitter.
NB_tuning_grid = {'mdl__alpha': np.logspace(-3, 1, 9)}

# Repeated stratified k-fold splitter; pass it as `cv=` to GridSearchCV if
# repeated cross-validation is wanted for the Naive Bayes search.
NB_cv = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)

In [38]:
# fit the models
#logisticRegression_grid_search = GridSearchCV(LRpipe, param_grid = logisticRegression_tuning_grid, cv = 5, return_train_score=True)
#logisticRegression_grid_search.fit(X_train, y_train)

# Now let's do the Random Forest Classifier
#randomForest_grid_search = GridSearchCV(randomForest_pipe, param_grid = randomForest_tuning_grid, cv = 5, return_train_score=True)
#randomForest_grid_search.fit(X_train, y_train)

# Now let's do the Naive Bayes Classifier
#NB_grid_search = GridSearchCV(naiveBayes_pipe, param_grid = NB_tuning_grid, cv = 5, return_train_score=True)
#NB_grid_search.fit(X_train, y_train)