# SD3 Dataset Analysis [PAIR Section 2]

IN-PROGRESS

Introduction text and visuals (including graphs) to be added later.

In [1]:
### Libraries/Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
### Loading the dataset
df = pd.read_csv('SD3/data.csv', delimiter = '\t')

In [3]:
### Cleaning the Data

# removing duplicate rows
df.drop_duplicates(inplace = True)

# removing the source (it refers to how the respondent completed the survey) and
# country columns
df.drop(columns = ["source", "country"], inplace = True)

# removing rows with missing values
df.dropna(inplace = True)

# replacing the 0s with 3s (I am removing invalid values and replace with the median)
df.replace(0,3, inplace=True)

# Some of the columns have their values invert, meaning that a low score indicates a stronger
# level of agreement with the question's associated trait. To standardize the interpretation
# across all columns, I will flip the values of a select number of columns
flip_cols = ['N2', 'N6', 'N8', 'P2', 'P4', 'P7']
for col in flip_cols:
    for i in df[col]:
        if df[col][i] == 1:
            df[col][i] = 5
        elif df[col][i] == 5:
            df[col][i] = 1
        elif df[col][i] == 4:
            df[col][i] = 2
        elif df[col][i] == 2:
            df[col][i] = 4


In [4]:
### Feature Engineering

# narcissism questions only dataframe
n_df = df.filter(regex = '^N')

# machiavellianism questions only dataframe
m_df = df.filter(regex = '^M')

# psychopathy questions only dataframe
p_df = df.filter(regex = '^P')

# calculating the average score per trait per user (I am also rounding to 3 significant digits)
n_avg_list = n_df.mean(axis = 1).round(3)
m_avg_list = m_df.mean(axis = 1).round(3)
p_avg_list = p_df.mean(axis = 1).round(3)

# Making lists that mark whether a person is within, above, or below the average range
# (between 25th and 75th percentiles). A 1 means above average range, 0 is within, and
# -1 is below.

'''
I may later adjust this code to have it loop through the lists only once to improve runtime.
'''

n_cat_list = [] # narcissism category list
for i in n_avg_list:
    if i < n_avg_list.describe()['25%']:
        n_cat_list.append(-1)
    elif i >= n_avg_list.describe()['25%'] and i < n_avg_list.describe()['75%']:
        n_cat_list.append(0)
    else:
        n_cat_list.append(1)

m_cat_list = []
for i in m_avg_list:
    if i < m_avg_list.describe()['25%']:
        m_cat_list.append(-1)
    elif i >= n_avg_list.describe()['25%'] and i < m_avg_list.describe()['75%']:
        m_cat_list.append(0)
    else:
        m_cat_list.append(1)

p_cat_list = []
for i in p_avg_list:
    if i < p_avg_list.describe()['25%']:
        p_cat_list.append(-1)
    elif i >= p_avg_list.describe()['25%'] and i < p_avg_list.describe()['75%']:
        p_cat_list.append(0)
    else:
        p_cat_list.append(1)


In [None]:
### Adding the averages and category lists to the dataframe
df['Narcissism_Category'] = n_cat_list
df['Machiavellianism_Category'] = m_cat_list
df['Psychopathy_Category'] = p_cat_list
df['Narcissism_Avg'] = n_avg_list
df['Machiavellianism_Avg'] = m_avg_list
df['Psychopathy_Avg'] = p_avg_list

In [None]:
df

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9,N1,...,P6,P7,P8,P9,Narcissism_Category,Machiavellianism_Category,Psychopathy_Category,Narcissism_Avg,Machiavellianism_Avg,Psychopathy_Avg
0,4,4,4,4,4,4,4,3,4,2,...,4,4,4,4,0,0,1,3.111,3.889,3.556
1,2,1,5,2,2,1,2,2,3,1,...,1,1,3,2,-1,-1,-1,2.444,2.222,1.667
2,3,3,3,5,1,1,5,5,3,2,...,1,2,3,1,-1,0,-1,1.556,3.222,2.444
3,5,5,4,5,5,5,5,5,5,5,...,5,1,1,5,1,1,1,4.556,4.889,3.778
4,4,4,2,5,5,5,4,1,4,3,...,3,1,4,1,0,0,0,2.889,3.778,3.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18187,1,5,2,4,4,5,4,3,5,4,...,3,2,3,3,0,0,0,3.000,3.667,3.111
18188,5,4,5,4,5,4,5,4,5,4,...,2,2,3,4,1,1,0,3.778,4.556,2.667
18189,4,3,3,4,2,3,4,3,4,3,...,1,2,4,1,0,0,-1,3.000,3.333,2.222
18190,5,4,3,3,4,4,4,3,4,3,...,3,4,1,3,1,0,0,3.444,3.778,3.222
