In [1]:
import seaborn as sns
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# create column names
df2 = pd.read_csv("drug_consumption.csv", 
                  names=["Age", "Gender", "Education", "Country", "Ethnicity", "Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS", "Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc", "Coke", "Crack", "Ecstacy", "Heroin", "Ketamine", "Legalh", "LSD", "Meth", "Mushrooms", "Nicotine", "Semer", "VSA"])

pd.DataFrame.head(df2)

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstacy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [2]:
arr = ["Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc", "Coke", "Crack", "Ecstacy", "Heroin", "Ketamine", "Legalh", "LSD", "Meth", "Mushrooms", "Nicotine", "Semer", "VSA"]
# remove CL and convert strings to ints
# Use frequency categories:
# CL0: Never used (0)
# CL1-2: Used to use (1)
# CL3-6: Current user (2)
for i in range(len(arr)):
    df2[arr[i]].replace({"CL0": "0", "CL1": "1", "CL2": "1", "CL3": "2", "CL4": "2", "CL5": "2", "CL6": "2"}, inplace=True)
    df2[arr[i]] = pd.to_numeric(df2[arr[i]])

In [3]:
# ML&Algo Part
# MLPRegression for all Personality Scores Together

# What drugs are associated with what personalities?
# X = particular drug consumption, y = personality measurements
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

x_unscaled = df2[["Alcohol", "Amphet", "Cannabis", "Coke", "Ecstacy", "LSD", "Meth", "Mushrooms"]]
y_unscaled = df2[["Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS"]]

scalerx = MinMaxScaler()
scalerx.fit(x_unscaled)
x = scalerx.transform(x_unscaled)

scalery = MinMaxScaler()
scalery.fit(y_unscaled)
y = scalery.transform(y_unscaled)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10, test_size = 0.1)


clf = MLPRegressor(hidden_layer_sizes=(12,3),activation="logistic",solver = 'sgd', max_iter = 500, random_state=1, learning_rate_init = 0.3)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test) 

print("MLPRegression for all Personality Scores Together", "\n")
print("Model Accuracy:", clf.score(x_test, y_test), "\n")


MLPRegression for all Personality Scores Together 

Model Accuracy: 0.06689873446104302 



In [4]:
# MLPRegression for Individual Personality Scores
# X = particular drug consumption, y = one personality measurement

x = df2[["Alcohol", "Amphet", "Cannabis", "Coke", "Ecstacy", "LSD", "Meth", "Mushrooms"]]

clf = MLPRegressor(hidden_layer_sizes=(12,3),activation="logistic",solver = 'sgd', max_iter = 500, random_state=1, learning_rate_init = 0.3)

print("MLPRegression for Individual Personality Scores", "\n")
personalityScoresList = ["Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS"]
for personalityScore in personalityScoresList:
    y = df2[personalityScore]

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10, test_size = 0.1)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(personalityScore)
    print("Model Accuracy:", clf.score(x_test, y_test), "\n")


MLPRegression for Individual Personality Scores 

Nscore
Model Accuracy: 0.005439078087634441 

Escore
Model Accuracy: -0.05073664087754537 

Oscore
Model Accuracy: 0.21162688914795258 

Ascore
Model Accuracy: 0.04122583528497803 

Cscore
Model Accuracy: 0.033293899264358884 

Impulsive
Model Accuracy: 0.05754620811692979 

SS
Model Accuracy: 0.19774279616813983 



In [5]:
# Linear Regression for Individual Personality Scores
# X = particular drug consumption, y = one personality measurement

from sklearn.linear_model import LinearRegression
x = df2[["Alcohol", "Amphet", "Cannabis", "Coke", "Ecstacy", "LSD", "Meth", "Mushrooms"]]

clf = LinearRegression()

print("Linear Regression for Individual Personality Scores", "\n")
personalityScoresList = ["Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS"]
for personalityScore in personalityScoresList:
    y = df2[personalityScore]

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10, test_size = 0.1)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(personalityScore)
    print("Model Accuracy:", clf.score(x_test, y_test), "\n")


Linear Regression for Individual Personality Scores 

Nscore
Model Accuracy: 0.022075133477591158 

Escore
Model Accuracy: 0.0049869706099553435 

Oscore
Model Accuracy: 0.23771574934894768 

Ascore
Model Accuracy: 0.03663903017916903 

Cscore
Model Accuracy: 0.05837975003199514 

Impulsive
Model Accuracy: 0.05363604711106362 

SS
Model Accuracy: 0.20341282490569823 



In [6]:
# Epsilon-Support Vector Regression (kernel = rbf)

from sklearn import svm

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10, test_size = 0.1)

regr = svm.SVR(kernel='rbf', degree=3)
regr.fit(x_train, y_train)
print("Epsilon-Support Vector Regression (kernel = rbf)", "\n")
print("Model Accuracy:", regr.score(x_test, y_test))

Epsilon-Support Vector Regression (kernel = rbf) 

Model Accuracy: 0.20996526096755797


In [7]:
# Epsilon-Support Vector Regression (kernel = linear)

from sklearn import svm

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10, test_size = 0.1)

regr = svm.SVR(kernel='linear', degree=3)
regr.fit(x_train, y_train)
print("Epsilon-Support Vector Regression (kernel = linear)", "\n")
print("Model Accuracy:", regr.score(x_test, y_test))

Epsilon-Support Vector Regression (kernel = linear) 

Model Accuracy: 0.20103058829742537
