In [None]:
# Load the preprocessed dataset from disk into a DataFrame
import pandas as pd

DATA_FILE = 'cleaned_data_filtered.csv'  # path is relative to the notebook's working directory
df = pd.read_csv(DATA_FILE)

In [None]:
# Report the (rows, columns) dimensions of the loaded DataFrame.
print(df.shape)

In [None]:
# Summary statistics for the DataFrame's columns (displayed as the cell output).
df.describe()

In [None]:
# Preview the first rows of the dataset as loaded from the CSV.
df.head()

In [None]:
# Remove the 'name' column so it is not carried into the feature matrix built later.
# errors='ignore' keeps this cell idempotent: because it reassigns `df`, running it a
# second time would otherwise raise KeyError (the column is already gone).
df = df.drop(columns=['name'], errors='ignore')

In [None]:
# Preview the DataFrame again to confirm the 'name' column is no longer present.
df.head()

In [None]:
# Split the table into the predictor matrix X and the regression target y.
feature_frame = df.drop(columns=['calories'])  # every column except the target
X = feature_frame.to_numpy()
y = df['calories'].to_numpy()

In [None]:
# Dividing the dataset into training and testing sets, then scaling the features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling so the scaler's mean/std are learned from the training rows
# only; fitting it on the full X lets test-set statistics leak into preprocessing.
# A fixed random_state makes the split (and every score derived from it)
# reproducible under Restart Kernel -> Run All.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on the training split only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept for backward compatibility with any cell that still reads X_scaled.
X_scaled = scaler.transform(X)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the multi-feature linear model on the scaled training split
# (fit() returns the estimator itself, so it can be chained).
LR = LinearRegression().fit(X_train, y_train)

# Score on the held-out test split; for a scikit-learn regressor this is R^2,
# not a classification accuracy.
LR.score(X_test, y_test)

In [None]:
# Score on the data the model was fitted on; compare with the test-split score
# to gauge over- or under-fitting.
LR.score(X_train, y_train) # training R^2 (a scikit-learn regressor's .score is R^2, not accuracy)

In [None]:
# Predict calories for the held-out test rows. X_test already holds scaled features,
# so no further transform is applied here. y_pred is reused by the evaluation-metrics cell.
y_pred = LR.predict(X_test)
y_pred

In [None]:
# Inspect the target array built from the 'calories' column.
print(y)

In [None]:
# Inspect the raw (unscaled) feature matrix built from every column except 'calories'.
print(X)

In [None]:
import numpy as np

# A single hand-written observation, built directly as a 2-D row of shape
# (1, num_features), which is the layout the scaler and the model expect.
# Original author's note: these values match the third sample in the dataset,
# so the prediction is expected to land close to 269.8 calories.
sample_input = np.array([[22, 32, 48, 39, 27, 5]])

# Apply the scaler that was fitted for training; the model was trained on scaled features.
sample_input_scaled = scaler.transform(sample_input)

# Run the fitted regression on the scaled row and report the result.
predicted_value = LR.predict(sample_input_scaled)
print("Predicted Calories:", predicted_value)

In [None]:
# Plotting a one-feature linear regression between each nutritional value and the calories,
# reusing the existing train/test split (X_train / X_test hold the scaled features).
from matplotlib import pyplot as plt

# Column labels in the same order as the columns of X; this is loop-invariant,
# so it is looked up once instead of on every iteration.
feature_names = df.drop(columns=['calories']).columns

# Iterate over however many feature columns the split actually has,
# rather than a hard-coded count.
for feature_index in range(X_train.shape[1]):
    feature_name = feature_names[feature_index]

    # Extract this feature as a single-column 2-D array, as scikit-learn expects
    X_train_feature = X_train[:, feature_index].reshape(-1, 1)
    X_test_feature = X_test[:, feature_index].reshape(-1, 1)

    # Fitting a model for visualization using only this one feature
    LR_single = LinearRegression()
    LR_single.fit(X_train_feature, y_train)
    y_pred_single = LR_single.predict(X_test_feature)

    # Actual test points, with the fitted regression line drawn over them
    plt.scatter(X_test_feature, y_test, color='b', label="Actual data")
    plt.plot(X_test_feature, y_pred_single, color='k', linewidth=2, label="Regression line")

    plt.xlabel(feature_name)
    plt.ylabel("Calories")
    plt.legend()
    plt.title(f"Linear Regression on {feature_name}")

    plt.show()

In [None]:
# Plotting a one-feature linear regression between each nutritional value and the calories,
# this time with a dedicated train/test split and scaler for every feature.
from matplotlib import pyplot as plt

# Column labels in the same order as the columns of X; this is loop-invariant,
# so it is looked up once instead of on every iteration.
feature_names = df.drop(columns=['calories']).columns

# Iterate over however many feature columns X actually has, rather than a hard-coded count.
for feature_index in range(X.shape[1]):
    feature_name = feature_names[feature_index]

    # Isolate this feature as a single-column 2-D array
    X_single = X[:, feature_index].reshape(-1, 1)

    # Split first, then fit the scaler on the training rows only, so no test-set
    # statistics leak into preprocessing. The fixed random_state makes the plots
    # reproducible and puts the same rows in the test set for every feature.
    X_train_raw_single, X_test_raw_single, y_train_single, y_test_single = train_test_split(
        X_single, y, test_size=0.2, random_state=42
    )
    single_scaler = StandardScaler()
    X_train_single = single_scaler.fit_transform(X_train_raw_single)
    X_test_single = single_scaler.transform(X_test_raw_single)

    # Fitting a model using only this one feature
    LR_single = LinearRegression()
    LR_single.fit(X_train_single, y_train_single)
    y_pred_single = LR_single.predict(X_test_single)

    # Actual test points, with the fitted regression line drawn over them
    plt.scatter(X_test_single, y_test_single, color='b', label="Actual data")
    plt.plot(X_test_single, y_pred_single, color='k', linewidth=2, label="Regression line")

    plt.xlabel(feature_name)
    plt.ylabel("Calories")
    plt.legend()
    plt.title(f"Linear Regression on {feature_name}")
    plt.show()

In [None]:
# Evaluation metrics for the multi-feature model on the held-out test split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
)

# Each metric compares the true test targets with the predictions stored in y_pred.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print one "label value" line per metric.
for metric_label, metric_value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse), ("r2:", r2)):
    print(metric_label, metric_value)

In [None]:
# Cross-validation: X and y are split into 5 folds and the estimator's default
# score is computed on each fold. Note that the raw (unscaled) X is passed here.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=LR, X=X, y=y, cv=5)
scores

In [None]:
# cross_validate is more comprehensive than cross_val_score: it gives timings,
# can show training scores, and can show results for multiple metrics.
from sklearn.model_selection import cross_validate

cv_metrics = ('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error','neg_mean_squared_error')

# X and y will be split into 5 folds; training-fold scores are requested as well.
scores = cross_validate(LR, X, y, cv=5, scoring=cv_metrics, return_train_score=True)
print(scores)

In [None]:
# Tabulate the cross-validation results: one row per entry in `scores`,
# with its mean and standard deviation across the folds.
scores_df = pd.DataFrame(scores)
summary_df = pd.DataFrame({
    "Mean": scores_df.mean(),
    "Std Dev": scores_df.std(),
})
print(summary_df)