In [None]:
# Import pandas
import pandas as pd

In [None]:
# Read the CSV at ../Resources/cleaned_drug_data.csv into a DataFrame
file_path = "../Resources/cleaned_drug_data.csv"
df = pd.read_csv(filepath_or_buffer = file_path)

# Bare last expression: the notebook renders the loaded frame as the cell output
df

In [None]:
# List the column labels of the loaded DataFrame so the variables available
# for selection in the next cell can be checked by eye.
df.columns

In [None]:
# Narrow the data to the columns used for modelling: the demographic fields,
# the personality measures, and the "illegal_use" column that later serves as the target.
filtered_df = df.loc[:, [
    "Age", "Gender", "Education", "Country", "Ethnicity",
    "Nscore", "Escore", "Oscore", "AScore", "Cscore",
    "Impulsive", "SS",
    "illegal_use",
]]

In [None]:
# Import sklearn dependencies
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Split the selected columns into the label (y) and the predictors (X):
# y is the "illegal_use" column, X is everything else.
y = filtered_df["illegal_use"]
X = filtered_df.drop(columns = ["illegal_use"])

In [None]:
# Encode categorical data with pandas.get_dummies using its default arguments.
# Presumably the numeric columns (e.g. the personality scores) pass through unchanged,
# since they are selected by name from X_dummies later in the notebook — verify dtypes.
X_dummies = pd.get_dummies(X)

In [None]:
# Render the encoded feature DataFrame as the cell output to inspect the result of get_dummies
X_dummies

In [None]:
# Import PCA dependency
from sklearn.decomposition import PCA

In [None]:
# Set up feature reduction to account for 90% variance.
# A float n_components between 0 and 1 asks scikit-learn to keep as many
# principal components as are needed to explain that fraction of the variance.
pca = PCA(n_components = .90)

In [None]:
# Fit PCA on the encoded features and project them onto the retained components.
# NOTE(review): PCA is fitted here on every row, before the train/test split that
# follows, so the rows that end up in the test set also shape the components.
# NOTE(review): the input is not standardized at this point (StandardScaler is only
# applied after the split), so features with larger numeric ranges dominate the
# components — consider scaling first and fitting PCA on the training split only.
pca_array = pca.fit_transform(X_dummies)

In [None]:
# Display the fraction of total variance explained by each retained principal component
# (one entry per component kept by the fitted PCA above).
pca.explained_variance_ratio_

In [None]:
# Create training and testing sets from the PCA-reduced features.
# random_state pins the shuffle so the split — and every score computed from it —
# is identical on each Restart & Run All; 42 is an arbitrary seed value.
X_train, X_test, y_train, y_test = train_test_split(pca_array, y, random_state = 42)

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# split only, and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression with feature reduction (PCA)

In [None]:
# Create the logistic regression classifier.
# No arguments are passed, so the estimator uses scikit-learn's default hyperparameters.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [None]:
# Fit the classifier on the standardized training features and their labels.
# As the last expression of the cell, the fitted estimator is also shown as the output.
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on the training split and on the held-out test split
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)
print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

In [None]:
# Create a confusion matrix graphic for the logistic regression on the test split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# The classifier was fitted on the standardized features (X_train_scaled), so the
# predictions must be made on X_test_scaled as well; predicting on the unscaled
# X_test would feed the model inputs on a different scale than it was trained on.
predictions = classifier.predict(X_test_scaled)

# Rows are the true labels and columns the predicted labels, both ordered by classifier.classes_
cm = confusion_matrix(y_test, predictions, labels = classifier.classes_)
cm_plot = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = classifier.classes_)

cm_plot.plot()
plt.show()

# Random Forest with feature reduction (PCA)

In [None]:
# Import RF dependency
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create a random forest (500 trees, depth capped at 10), fit it on the standardized
# training features, and report its score on both splits.
# random_state fixes the forest's internal randomness so the reported accuracies are
# reproducible on every run; 42 is an arbitrary seed value.
clf = RandomForestClassifier(n_estimators = 500, max_depth = 10, random_state = 42).fit(X_train_scaled, y_train)
print(f"Training Accuracy: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Accuracy: {clf.score(X_test_scaled, y_test)}")

In [None]:
# Take an independent (deep) copy of the encoded features so later filtering
# works on the copy rather than on X_dummies, then preview its first five rows.
copy_df = X_dummies.copy(deep = True)
copy_df.head(n = 5)

In [None]:
# From the copy, keep only the personality measures; the demographic columns are left out
personality_df = copy_df.loc[:, [
    "Nscore", "Escore", "Oscore", "AScore", "Cscore",
    "Impulsive", "SS",
]]

In [None]:
# Render the personality-only DataFrame as the cell output to confirm the column selection
personality_df

In [None]:
# Create new training and testing sets from the personality-only features.
# These reassign X_train / X_test / y_train / y_test used by the cells that follow.
# random_state pins the shuffle so the split — and every score computed from it —
# is identical on each Restart & Run All; 42 is an arbitrary seed value.
X_train, X_test, y_train, y_test = train_test_split(personality_df, y, random_state = 42)

In [None]:
# Standardize the personality features: a new scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression with only personality variables

In [None]:
# Create another logistic regression classifier with default hyperparameters.
# This rebinds `classifier`, replacing the model that was fitted on the PCA features.
classifier = LogisticRegression()

In [None]:
# Fit the classifier on the standardized personality training features and their labels.
# As the last expression of the cell, the fitted estimator is also shown as the output.
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the personality-only classifier's score on the training and test splits
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)
print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

In [None]:
# Show the confusion matrix for the personality-only logistic regression on the test split.
# The classifier was fitted on the standardized features (X_train_scaled), so the
# predictions must be made on X_test_scaled as well; predicting on the unscaled
# X_test would feed the model inputs on a different scale than it was trained on.
predictions = classifier.predict(X_test_scaled)

# Rows are the true labels and columns the predicted labels, both ordered by classifier.classes_
cm = confusion_matrix(y_test, predictions, labels = classifier.classes_)
cm_plot = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = classifier.classes_)

cm_plot.plot()
plt.show()

# Random Forest with only personality variables

In [None]:
# Import RFC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest (500 trees, depth capped at 10) on the standardized personality
# features and report its score on both splits.
# random_state fixes the forest's internal randomness so the reported accuracies are
# reproducible on every run; 42 is an arbitrary seed value.
clf = RandomForestClassifier(n_estimators = 500, max_depth = 10, random_state = 42).fit(X_train_scaled, y_train)
print(f"Training Accuracy: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Accuracy: {clf.score(X_test_scaled, y_test)}")