<a href="https://colab.research.google.com/github/thenameisAnurag/BigData/blob/main/Big%20Market%20Sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read both CSV files from the current working directory (train first, then test)
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

# Preview the leading rows of each frame
print("Train data:\n", train.head(n=5))
print("\nTest data:\n", test.head(n=5))

# Report the (rows, columns) dimensions of each frame
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Count the missing entries per column of the training frame
print("Missing values in train data:\n", train.isna().sum())

# Data preparation function
def data_prep(train, current_year=2024):
    """Return a cleaned copy of ``train`` with an added ``YOB`` column.

    The input frame is left untouched; all changes are made on a copy.

    Steps:
      * ``Item_Weight``: missing values are replaced by the column median.
      * ``Outlet_Size``: missing values are replaced by the most frequent
        value. If the column has no non-missing value at all, it is left
        as-is instead of raising.
      * ``Item_Fat_Content``: the spellings ``'low fat'``/``'LF'`` become
        ``'Low Fat'`` and ``'reg'`` becomes ``'Regular'``.
      * ``YOB``: ``current_year - Outlet_Establishment_Year``.

    Args:
        train: DataFrame containing the columns ``Item_Weight``,
            ``Outlet_Size``, ``Item_Fat_Content`` and
            ``Outlet_Establishment_Year``.
        current_year: Reference year used to compute ``YOB``. Defaults to
            2024, the value that was previously hard-coded.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is not silently modified.
    prepared = train.copy()

    weight_median = prepared['Item_Weight'].median(skipna=True)
    prepared['Item_Weight'] = prepared['Item_Weight'].fillna(weight_median)

    # mode() is empty when every value is missing; indexing it would raise.
    size_modes = prepared['Outlet_Size'].mode()
    if not size_modes.empty:
        prepared['Outlet_Size'] = prepared['Outlet_Size'].fillna(size_modes.iloc[0])

    prepared['Item_Fat_Content'] = prepared['Item_Fat_Content'].replace(
        {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
    )
    prepared['YOB'] = current_year - prepared['Outlet_Establishment_Year']
    return prepared

# Run the cleaning routine on the training frame
train_new = data_prep(train)

# Verify how many missing values remain per column after cleaning
print("Missing values in train_new data:\n", train_new.isna().sum())

# Remove, in place, the columns that are not used as model features
unused_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
    'Outlet_Type',
    'Outlet_Location_Type',
    'Outlet_Size',
]
train_new.drop(columns=unused_columns, inplace=True)

# Separate the target column from the features, then hold out 25% of the rows
x = train_new.drop(columns=['Item_Outlet_Sales'])
y = train_new['Item_Outlet_Sales']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=15,
)

# Show the dimensions of each partition (features first, then target)
print("Train shapes:", x_train.shape, y_train.shape, sep="\n")
print("\nTest shapes:", x_test.shape, y_test.shape, sep="\n")

# Linear Regression: fit on the training partition, predict on both partitions
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_train = lr.predict(x_train)
lr_test = lr.predict(x_test)

# Random Forest: fit on the training partition, predict on both partitions.
# The forest is seeded (same value as the train/test split) so that the
# reported metrics and plots are reproducible from run to run; an unseeded
# forest gives different scores on every execution.
rf = RandomForestRegressor(random_state=15)
rf.fit(x_train, y_train)
rf_preds_train = rf.predict(x_train)
rf_preds_test = rf.predict(x_test)

# Function for Model Evaluation
def model_eval(actual, predicted, model_name):
    """Print the RMSE and R2 score of ``predicted`` against ``actual``.

    Args:
        actual: Ground-truth target values.
        predicted: Model predictions aligned with ``actual``.
        model_name: Label printed in the header line of the report.

    Returns:
        None. The metrics are written to standard output, rounded to two
        decimal places.
    """
    squared_error = mean_squared_error(actual, predicted)
    determination = r2_score(actual, predicted)
    root_error = np.sqrt(squared_error)

    print(f'{model_name} Model Evaluation:')
    for caption, metric in (('RMSE:', root_error), ('R2 Score:', determination)):
        print(caption, round(metric, 2))

# Print RMSE and R2 for each model on the train and test partitions
for targets, preds, tag in (
    (y_train, lr_train, "Linear Regression (Train)"),
    (y_test, lr_test, "Linear Regression (Test)"),
    (y_train, rf_preds_train, "Random Forest (Train)"),
    (y_test, rf_preds_test, "Random Forest (Test)"),
):
    model_eval(targets, preds, tag)

# Visualization - one actual-vs-predicted scatter figure per model, drawn on
# the test partition (Linear Regression first, then Random Forest)
for plot_title, test_preds in (
    ('Linear Regression: Actual vs Predicted', lr_test),
    ('Random Forest: Actual vs Predicted', rf_preds_test),
):
    plt.figure(figsize=(12, 6))
    plt.scatter(y_test, test_preds, color='blue', label='Predictions')
    # Plotting y_test against itself traces the y = x reference line
    plt.plot(y_test, y_test, color='red', linestyle='--', label='Actual')
    plt.title(plot_title)
    plt.xlabel('Actual Sales')
    plt.ylabel('Predicted Sales')
    plt.legend()
    plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'