# Dependencies

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data Review and Cleaning

In [None]:
# Location of the cleaned drug data CSV (relative path)
file = "../Resources/cleaned_drug_data.csv"
# Read the CSV into a DataFrame
df = pd.read_csv(file)
# Preview the first rows
df.head()

In [None]:
# Show the column labels of the loaded data
df.columns

In [None]:
# Remove the ID column, the per-substance columns (Alcohol through VSA), the two
# *_score_sum columns and legal_use. illegal_use is kept; it is used as the
# target further down.
df = df.drop(
    columns=[
        "ID", 'Alcohol',
        'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack',
        'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms',
        'Nicotine', 'VSA', 'illegal_score_sum', 'legal_score_sum', 'legal_use',
    ]
)

In [None]:
# Row count for each distinct Age value
df["Age"].value_counts()

Note: There are 6 categories for Age

In [None]:
# Row count for each distinct Education value
df["Education"].value_counts()

Note: There are 9 categories for Education

In [None]:
# Row count for each distinct Country value
df["Country"].value_counts()

Note: There are 7 categories for Country

In [None]:
# Row count for each distinct Ethnicity value
df["Ethnicity"].value_counts()

Note: There are 7 categories for Ethnicity

In [None]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

## Encoding Categorical Columns

In [None]:
# collect the names of the object-dtype columns; these are the columns that get
# one-hot encoded below
df_cat = [column for column, dtype in df.dtypes.items() if dtype == "object"]

In [None]:
# number of distinct values in each object-dtype column
df.loc[:, df_cat].nunique()

In [None]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder instance.
# The `sparse=` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output=`) and removed in 1.4, so passing either spelling fails on some
# release. Keep the encoder's default output and densify it explicitly instead.
enc = OneHotEncoder()

# Fit and transform the categorical columns, convert the result to a dense
# array, and label the columns with the encoded variable names.
# The frame reuses df's index so the index-based merge that follows lines up
# row for row.
encode_df = pd.DataFrame(
    enc.fit_transform(df[df_cat]).toarray(),
    columns=enc.get_feature_names_out(df_cat),
    index=df.index,
)
encode_df

## Merge to Create One Dataset

In [None]:
# join the one-hot columns onto the original frame by row index, then discard
# the raw categorical columns so only numerical columns are left
df_merge = pd.merge(df, encode_df, left_index=True, right_index=True).drop(columns=df_cat)
df_merge.head()

## Split Data into Target and Features

In [None]:
# feature frame: everything in the merged data except the target column
df_pre = df_merge.drop(columns="illegal_use")

In [None]:
# features (X) and target (y); the target is read from the un-encoded frame
X, y = df_pre, df["illegal_use"]

## Train Test Split and Scale Data

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=1 fixes the shuffle so the split is reproducible; no test_size is
# given, so scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =1)

In [None]:
# scale data: the scaler is fitted on the training rows only and then applied
# to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest (100 trees, depth capped at 10) trained on the scaled training
# features; accuracy is reported on both partitions
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
clf.fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# Keep a reference to the fitted model's per-feature importance scores
feature_importances = clf.feature_importances_

In [None]:
# Pair each feature name with its importance and order the pairs from least to
# most important
features = sorted(
    zip(X.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
)
cols = [name for name, _ in features]
width = [importance for _, importance in features]

# Tall figure so every feature label gets its own readable row
fig, ax = plt.subplots()
fig.set_size_inches(10, 20)

# Nearly remove the vertical padding around the bars
ax.margins(y=0.001)

# Horizontal bars: one per feature, length = importance
ax.barh(y=cols, width=width)

plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyperparameters.
# Note: this reuses the name `clf` that the random-forest cells also assign.
clf = LogisticRegression()
# Display the (not yet fitted) estimator
clf

In [None]:
# fit the logisticregression model
# NOTE(review): this trains on the unscaled X_train although X_train_scaled is
# available from the scaling cell -- confirm that is intended.
clf.fit(X_train, y_train)

In [None]:
# score the training and test models and print the scores
# (scored on the same unscaled features the model was fitted on)
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

In [None]:
# confusion matrix for the current classifier on the test partition
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# predicted labels for the held-out rows
predictions = clf.predict(X_test)

# rows/columns follow the class order stored on the fitted model
cm = confusion_matrix(y_true=y_test, y_pred=predictions, labels=clf.classes_)
cm_plot = ConfusionMatrixDisplay(cm, display_labels=clf.classes_)

# draw the matrix and show the figure
cm_plot.plot()
plt.show()

## Remove Ethnicity Attribute

In [None]:
# copy of the data without the Ethnicity column
df1 = df.drop(columns=['Ethnicity'])
df1

In [None]:
# collect the names of the object-dtype columns that remain in df1
df1_cat = [column for column, dtype in df1.dtypes.items() if dtype == "object"]

In [None]:
# Create a OneHotEncoder instance.
# The `sparse=` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output=`) and removed in 1.4, so passing either spelling fails on some
# release. Keep the encoder's default output and densify it explicitly instead.
enc = OneHotEncoder()

# Fit and transform the categorical columns, convert the result to a dense
# array, and label the columns with the encoded variable names.
# The frame reuses df1's index so the index-based merge that follows lines up
# row for row.
encode_df1 = pd.DataFrame(
    enc.fit_transform(df1[df1_cat]).toarray(),
    columns=enc.get_feature_names_out(df1_cat),
    index=df1.index,
)
encode_df1

In [None]:
# join the one-hot columns onto df1 by row index, then discard the raw
# categorical columns so only numerical columns are left
df1_merge = pd.merge(df1, encode_df1, left_index=True, right_index=True).drop(columns=df1_cat)
df1_merge.head()

In [None]:
# feature frame: everything in the merged data except the target column
df1_pre = df1_merge.drop(columns="illegal_use")

In [None]:
# features (X) and target (y); the target is still read from the original df
X, y = df1_pre, df["illegal_use"]

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=1 fixes the shuffle so the split is reproducible; no test_size is
# given, so scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =1)

In [None]:
# scale data: the scaler is fitted on the training rows only and then applied
# to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Random forest (100 trees, depth capped at 11) trained on the current scaled
# training features; accuracy is reported on both partitions
clf = RandomForestClassifier(n_estimators=100, max_depth=11, random_state=1)
clf.fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

## Remove Ethnicity and Country Attributes

In [None]:
# copy of the data without the Ethnicity and Country columns
df2 = df.drop(columns=['Ethnicity', 'Country'])
df2

In [None]:
# collect the names of the object-dtype columns that remain in df2
df2_cat = [column for column, dtype in df2.dtypes.items() if dtype == "object"]

In [None]:
# Create a OneHotEncoder instance.
# The `sparse=` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output=`) and removed in 1.4, so passing either spelling fails on some
# release. Keep the encoder's default output and densify it explicitly instead.
enc = OneHotEncoder()

# Fit and transform the categorical columns, convert the result to a dense
# array, and label the columns with the encoded variable names.
# The frame reuses df2's index so the index-based merge that follows lines up
# row for row.
encode_df2 = pd.DataFrame(
    enc.fit_transform(df2[df2_cat]).toarray(),
    columns=enc.get_feature_names_out(df2_cat),
    index=df2.index,
)
encode_df2

In [None]:
# join the one-hot columns onto df2 by row index, then discard the raw
# categorical columns so only numerical columns are left
df2_merge = pd.merge(df2, encode_df2, left_index=True, right_index=True).drop(columns=df2_cat)
df2_merge.head()

In [None]:
# feature frame: everything in the merged data except the target column
df2_pre = df2_merge.drop(columns="illegal_use")

In [None]:
# features (X) and target (y); the target is still read from the original df
X, y = df2_pre, df["illegal_use"]

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=1 fixes the shuffle so the split is reproducible; no test_size is
# given, so scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =1)

In [None]:
# scale data: the scaler is fitted on the training rows only and then applied
# to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Random forest (100 trees, depth capped at 6) trained on the current scaled
# training features; accuracy is reported on both partitions
clf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=1)
clf.fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')