In [13]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Step 2: Load the datasets
csv_url_b = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/leafly_effects_b.csv'
csv_url_a = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/leafly_effects_a.csv'
csv_url_analytical = 'https://raw.githubusercontent.com/zeroday-zaddy/cs422-project/main/data/analytical360.csv'

df_b = pd.read_csv(csv_url_b)
df_a = pd.read_csv(csv_url_a)
df_analytical = pd.read_csv(csv_url_analytical)

# Step 3: Preprocess the data
# Combine data by name (strain)
df_b.rename(columns={'name': 'Strain'}, inplace=True)
combined_df = pd.merge(df_b, df_a, on='Strain', how='outer')

# Select relevant columns for terpenes
terpene_columns = ['Terpinolene', 'alpha-Bisabolol', 'alpha-Pinene', 'alpha-Terpinene', 'beta-Caryophyllene', 'beta-Myrcene', 'beta-Pinene']
combined_df = pd.merge(combined_df, df_analytical[terpene_columns], on='Strain', how='left')

# Handling missing values
combined_df.fillna('N/A', inplace=True)

# Step 4: Split the data into training and testing sets
X = combined_df[terpene_columns[1:]]  # Features (excluding 'Strain')
y = combined_df['Effects']              # Target

# Handling missing values in the combined dataset
X.fillna(X.mean(), inplace=True)
y.fillna('N/A', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train and evaluate Decision Tree model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_pred)
print(f'Decision Tree Mean Squared Error: {dt_mse}')

# Step 6: Train and evaluate Random Forest model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
print(f'Random Forest Mean Squared Error: {rf_mse}')

# Step 7: Train and evaluate Neural Network model
nn_model = MLPRegressor()
nn_model.fit(X_train, y_train)
nn_pred = nn_model.predict(X_test)
nn_mse = mean_squared_error(y_test, nn_pred)
print(f'Neural Network Mean Squared Error: {nn_mse}')

# Step 8: Compare models visually
plt.scatter(y_test, dt_pred, label='Decision Tree', color='red', alpha=0.5)
plt.scatter(y_test, rf_pred, label='Random Forest', color='green', alpha=0.5)
plt.scatter(y_test, nn_pred, label='Neural Network', color='blue', alpha=0.5)
plt.xlabel('True Effects')
plt.ylabel('Predicted Effects')
plt.title('Comparison of Model Predictions')
plt.legend()
plt.show()


KeyError: 'Strain'