In [15]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Load and preprocess data
csv_url = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/c067b005a7f9a90ae114357cfe7948ed828dc07a/data/leafly_strain_data.csv'
warnings.filterwarnings('ignore')
df = pd.read_csv(csv_url)

# Drop rows where 'name' is blank
df = df.dropna(subset=['name'])

# Handling missing values
df[['happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']] = df[['happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']].fillna(0)

# Extract features (X) and target variables (y)
X = df[['most_common_terpene']]
y = df[['relaxed', 'happy', 'euphoric', 'uplifted', 'sleepy', 'dry_mouth', 'dry_eyes', 'dizzy', 'paranoid', 'anxious', 'stress', 'pain', 'depression', 'anxiety', 'insomnia']]

# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[['most_common_terpene']])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a machine learning model using MultiOutputClassifier
base_classifier = DecisionTreeClassifier(random_state=42)
model = MultiOutputClassifier(base_classifier)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance for each target variable
for i, effect in enumerate(y.columns):
    print(f"Effect: {effect}")
    print(classification_report(y_test[effect], y_pred[:, i], zero_division='warn'))
    print("=" * 50)


Effect: relaxed
              precision    recall  f1-score   support

          0%       0.44      1.00      0.61       418
        100%       0.00      0.00      0.00        68
         12%       0.00      0.00      0.00         1
         16%       0.00      0.00      0.00         1
         18%       0.00      0.00      0.00         2
         20%       0.00      0.00      0.00         1
         22%       0.00      0.00      0.00         3
         23%       0.00      0.00      0.00         1
         25%       0.00      0.00      0.00         7
         27%       0.00      0.00      0.00         2
         28%       0.00      0.00      0.00         3
         30%       0.00      0.00      0.00         1
         31%       0.00      0.00      0.00         1
         32%       0.00      0.00      0.00         1
         33%       0.00      0.00      0.00        10
         35%       0.00      0.00      0.00         3
         36%       0.00      0.00      0.00         1
         37

#Binary Classification using strains_cleaned.csv


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score

# Load and preprocess data
csv_path = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv'
df = pd.read_csv(csv_path)

# Replace NaN values with a placeholder (e.g., 'Unknown') in specific columns
df['Main_Effect'] = df['Main_Effect'].fillna('Unknown')
df['Terpene'] = df['Terpene'].fillna('Unknown')

# Drop rows with the placeholder
df = df[df['Main_Effect'] != 'Unknown']
df = df[df['Terpene'] != 'Unknown']

# Save the cleaned DataFrame to a new CSV file
df.to_csv('cleaned_file2.csv', index=False)

# Convert the 'Terpene' column to a binary representation using one-hot encoding
mlb = MultiLabelBinarizer()
X_encoded = mlb.fit_transform(df['Terpene'].str.split(','))

# Create a new dataframe with the one-hot encoded Terpene column and the target variable (Main_Effect)
df_encoded = pd.concat([pd.DataFrame(X_encoded, columns=mlb.classes_), df['Main_Effect']], axis=1)

print(df_encoded)









      Caryophyllene  Humulene  Limonene  Linalool  Myrcene  Ocimene  Pinene  \
0               1.0       0.0       0.0       0.0      0.0      0.0     0.0   
1               1.0       0.0       0.0       0.0      0.0      0.0     0.0   
2               0.0       0.0       0.0       0.0      1.0      0.0     0.0   
3               1.0       0.0       0.0       0.0      0.0      0.0     0.0   
4               0.0       0.0       1.0       0.0      0.0      0.0     0.0   
...             ...       ...       ...       ...      ...      ...     ...   
5071            NaN       NaN       NaN       NaN      NaN      NaN     NaN   
5081            NaN       NaN       NaN       NaN      NaN      NaN     NaN   
5087            NaN       NaN       NaN       NaN      NaN      NaN     NaN   
5097            NaN       NaN       NaN       NaN      NaN      NaN     NaN   
5105            NaN       NaN       NaN       NaN      NaN      NaN     NaN   

      Terpinolene Main_Effect  
0             0.0  

#Train model and make prediction


In [None]:
# Convert the 'Terpene' column to a binary representation using one-hot encoding
mlb = MultiLabelBinarizer()
X_encoded = mlb.fit_transform(df['Terpene'].str.split(','))

# Create a new dataframe with the one-hot encoded Terpene column and the target variable (Main_Effect)
df_encoded = pd.concat([pd.DataFrame(X_encoded, columns=mlb.classes_), df['Main_Effect']], axis=1)

# Extract features (X) and target variable (y)
X = df_encoded.drop('Main_Effect', axis=1)
y = df_encoded['Main_Effect']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a machine learning model for each effect using RandomForestClassifier
models = {}
effect_predictions = {}

for effect in mlb.classes_:
    model = RandomForestClassifier(random_state=42)
    y_train_effect = (y_train == effect).astype(int)
    X_train_effect = X_train  # Use the original X_train without imputation
    model.fit(X_train_effect, y_train_effect)
    models[effect] = model

    # Make predictions for the test set
    y_pred_effect = model.predict(X_test)
    effect_predictions[effect] = y_pred_effect

# Train an overall model using RandomForestClassifier
overall_model = RandomForestClassifier(random_state=42)
X_train_overall = X_train  # Use the original X_train without imputation
y_train_overall = y_train  # Use the original y_train without imputation
overall_model.fit(X_train_overall, y_train_overall)

# Make predictions for the overall test set
y_pred_overall = overall_model.predict(X_test)

# Print predictions for each effect
print("Predictions for Each Effect:")
for effect in mlb.classes_:
    accuracy = accuracy_score((y_test == effect).astype(int), effect_predictions[effect])
    print(f"Effect: {effect}, Accuracy: {accuracy}")
    print(classification_report((y_test == effect).astype(int), effect_predictions[effect]))
    print("=" * 50)

# Print overall predictions
print("Overall Predictions:")
overall_accuracy = accuracy_score(y_test, y_pred_overall)
print(f"Overall Accuracy: {overall_accuracy}")
print(classification_report(y_test, y_pred_overall))

#Model based on professors example of Random Forest

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load your marijuana project data from a CSV file
# Replace 'marijuana_data.csv' with the actual path to your CSV file
df = pd.read_csv('https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/strains_cleaned.csv')


# Drop rows with missing values
df = df.dropna()

# Assuming 'Target' is the name of your target column
X = df['Terpene']
y = df['Main_Effect']

# One-hot encode categorical columns in X
# Replace 'categorical_columns' with the actual names of your categorical columns
categorical_columns = ['Main_Effect', 'Terpene']
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, random_state=42)

# Create and train the Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.226
Accuracy on test set: 0.243
