Model using MultiOutput Classifier  

In [15]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Load and preprocess data
csv_url = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/c067b005a7f9a90ae114357cfe7948ed828dc07a/data/leafly_strain_data.csv'
warnings.filterwarnings('ignore')
df = pd.read_csv(csv_url)

# Drop rows where 'name' is blank
df = df.dropna(subset=['name'])

# Handling missing values
df[['happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']] = df[['happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']].fillna(0)

# Extract features (X) and target variables (y)
X = df[['most_common_terpene']]
y = df[['relaxed', 'happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']]

# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[['most_common_terpene']])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a machine learning model using MultiOutputClassifier
base_classifier = DecisionTreeClassifier(random_state=42)
model = MultiOutputClassifier(base_classifier)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance for each target variable
for i, effect in enumerate(y.columns):
    print(f"Effect: {effect}")
    print(classification_report(y_test[effect], y_pred[:, i], zero_division='warn'))
    print("=" * 50)


Effect: relaxed
              precision    recall  f1-score   support

          0%       0.44      1.00      0.61       418
        100%       0.00      0.00      0.00        68
         12%       0.00      0.00      0.00         1
         16%       0.00      0.00      0.00         1
         18%       0.00      0.00      0.00         2
         20%       0.00      0.00      0.00         1
         22%       0.00      0.00      0.00         3
         23%       0.00      0.00      0.00         1
         25%       0.00      0.00      0.00         7
         27%       0.00      0.00      0.00         2
         28%       0.00      0.00      0.00         3
         30%       0.00      0.00      0.00         1
         31%       0.00      0.00      0.00         1
         32%       0.00      0.00      0.00         1
         33%       0.00      0.00      0.00        10
         35%       0.00      0.00      0.00         3
         36%       0.00      0.00      0.00         1
         37

Model based on Random Forest & Decision Tree
#Cross Validation Included

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Load your marijuana project data from a CSV file
# Replace 'marijuana_data.csv' with the actual path to your CSV file
df = pd.read_csv('https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv')


# Drop rows with missing values
df = df.dropna()

# Assuming 'Target' is the name of your target column
X = df['Terpene']
y = df['Main_Effect']

# One-hot encode categorical columns in X
# Replace 'categorical_columns' with the actual names of your categorical columns
categorical_columns = ['Main_Effect', 'Terpene']
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, random_state=42)
#---------------------------------------------
# Create and train Random Forest Classifier
model = make_pipeline(SimpleImputer(strategy='mean'), RandomForestClassifier(random_state=0))

# Create and train the Decision Tree Classifier
#tree = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
#-----------------------------------------------
print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))

# Example of using cross-validation to get a more reliable estimate
cv_scores = cross_val_score(model, X_encoded, y, cv=5)
print("Cross-validated accuracy: {:.3f}".format(cv_scores.mean()))

Accuracy on training set: 0.226
Accuracy on test set: 0.222
Cross-validated accuracy: 0.213


#Example using Logistic Regression

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

# Load your marijuana project data from a CSV file
# Replace 'marijuana_data.csv' with the actual path to your CSV file
df = pd.read_csv('https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv')

# Drop rows with missing values
df = df.dropna()

# Assuming 'Target' is the name of your target column
X = df['Terpene']
y = df['Main_Effect']

# One-hot encode categorical columns in X
# Replace 'categorical_columns' with the actual names of your categorical columns
categorical_columns = ['Main_Effect', 'Terpene']
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, random_state=42)

# Create and train the Logistic Regression model
logreg_model = LogisticRegression(random_state=0, max_iter=1000)
logreg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set: {:.3f}".format(accuracy))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy on test set: 0.236
Classification Report:
               precision    recall  f1-score   support

     Aroused       0.00      0.00      0.00         5
    Creative       0.00      0.00      0.00         5
   Energetic       0.33      0.25      0.29        20
    Euphoric       0.00      0.00      0.00        10
     Focused       0.00      0.00      0.00         7
      Giggly       0.00      0.00      0.00        10
       Happy       0.00      0.00      0.00         7
      Hungry       0.00      0.00      0.00        11
     Relaxed       0.00      0.00      0.00        17
      Sleepy       0.22      0.97      0.36        30
   Talkative       0.00      0.00      0.00         7
      Tingly       0.00      0.00      0.00         7
    Uplifted       0.00      0.00      0.00         8

    accuracy                           0.24       144
   macro avg       0.04      0.09      0.05       144
weighted avg       0.09      0.24      0.12       144



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest using more X features


In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load data from CSV
df = pd.read_csv('https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv')

# Drop rows with missing values
df = df.dropna()

# Define features (X) and target variable (y)
X_columns = ['Type', 'Rating', 'Num_Reviews', 'THC%', 'Other_Cannabinoids', 'Terpene']
y_column = 'Main_Effect'

X = df[X_columns]
y = df[y_column]

# Encode categorical variables
X_encoded = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, random_state=42)

# Create and train a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy on test set: 0.3263888888888889
Classification Report:
              precision    recall  f1-score   support

     Aroused       0.00      0.00      0.00         5
    Creative       0.00      0.00      0.00         5
   Energetic       0.54      0.70      0.61        20
    Euphoric       0.25      0.20      0.22        10
     Focused       0.00      0.00      0.00         7
      Giggly       0.00      0.00      0.00        10
       Happy       0.00      0.00      0.00         7
      Hungry       0.00      0.00      0.00        11
     Relaxed       0.27      0.18      0.21        17
      Sleepy       0.49      0.83      0.62        30
   Talkative       0.00      0.00      0.00         7
      Tingly       0.20      0.14      0.17         7
    Uplifted       0.33      0.25      0.29         8

    accuracy                           0.33       144
   macro avg       0.16      0.18      0.16       144
weighted avg       0.25      0.33      0.28       144



Example using Binary Classification to determine if Main Effect is Sleepy

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load data from CSV
df = pd.read_csv('https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv')

# Drop rows with missing values
df = df.dropna()

# Define features (X) and binary target variable (y)
X_columns = ['Type', 'Rating', 'Num_Reviews', 'THC%', 'Other_Cannabinoids', 'Terpene']
y_column = 'Main_Effect'

# Binary classification: 1 if Main_Effect is "Sleepy", 0 otherwise
df['Binary_Target'] = (df[y_column] == 'Sleepy').astype(int)

X = df[X_columns]
y = df['Binary_Target']

# Encode categorical variables
X_encoded = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, random_state=42)

# Create and train a Logistic Regression model
logistic_regression = LogisticRegression(random_state=42)
logistic_regression.fit(X_train, y_train)

# Make predictions
y_pred = logistic_regression.predict(X_test)

# Evaluate the model
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy on test set: 0.8194444444444444
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       114
           1       0.56      0.63      0.59        30

    accuracy                           0.82       144
   macro avg       0.73      0.75      0.74       144
weighted avg       0.83      0.82      0.82       144



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
