In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

# Read the four source CSV files, in order, into their DataFrames
stores, features, train, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'stores.csv',
        'features.csv',
        'train.csv',
        'test.csv',  # assuming this is the separate test dataset
    )
)




In [None]:
# The four raw tables, in the order they are reported below
raw_frames = (stores, features, train, test)

# Per-column count of missing values for each table
for raw_frame in raw_frames:
    print(raw_frame.isna().sum())

# Summary statistics for each table
for raw_frame in raw_frames:
    print(raw_frame.describe())


Store    0
Type     0
Size     0
dtype: int64
Store              0
Date               0
Temperature        0
Fuel_Price         0
MarkDown1       4158
MarkDown2       5269
MarkDown3       4577
MarkDown4       4726
MarkDown5       4140
CPI              585
Unemployment     585
IsHoliday          0
dtype: int64
Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
dtype: int64
Store        0
Dept         0
Date         0
IsHoliday    0
dtype: int64
           Store           Size
count  45.000000      45.000000
mean   23.000000  130287.600000
std    13.133926   63825.271991
min     1.000000   34875.000000
25%    12.000000   70713.000000
50%    23.000000  126512.000000
75%    34.000000  202307.000000
max    45.000000  219622.000000
             Store  Temperature   Fuel_Price      MarkDown1      MarkDown2  \
count  8190.000000  8190.000000  8190.000000    4032.000000    2921.000000   
mean     23.000000    59.356198     3.405992    7032.371786    3384.1

In [None]:
# Labels of every numeric column in the features table
numeric_cols = features.select_dtypes(include=['number']).columns

# Replace each numeric NaN with the mean of its own column;
# non-numeric columns are left untouched
numeric_block = features[numeric_cols]
features[numeric_cols] = numeric_block.fillna(numeric_block.mean())

# Remaining NaN count per numeric column, followed by the imputed table
print(features[numeric_cols].isna().sum())
features


Store           0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,211.096358,8.106000,False
1,1,2010-02-12,38.51,2.548,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,211.242170,8.106000,True
2,1,2010-02-19,39.93,2.514,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,211.289143,8.106000,False
3,1,2010-02-26,46.63,2.561,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,211.319643,8.106000,False
4,1,2010-03-05,46.50,2.625,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,211.350143,8.106000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8185,45,2013-06-28,76.05,3.639,4842.290000,975.030000,3.00000,2449.970000,3169.690000,172.460809,7.826821,False
8186,45,2013-07-05,77.50,3.614,9090.480000,2268.580000,582.74000,5797.470000,1514.930000,172.460809,7.826821,False
8187,45,2013-07-12,79.37,3.614,3789.940000,1827.310000,85.72000,744.840000,2150.360000,172.460809,7.826821,False
8188,45,2013-07-19,82.84,3.737,2961.490000,1047.070000,204.19000,363.000000,1059.460000,172.460809,7.826821,False


In [None]:
def _attach_store_and_features(sales, store_info, weekly_features):
    """Join store metadata and per-date features onto a sales table.

    Parameters
    ----------
    sales : DataFrame with at least ``Store`` and ``Date`` columns.
    store_info : DataFrame keyed by ``Store``.
    weekly_features : DataFrame keyed by ``Store`` and ``Date``.

    Returns
    -------
    DataFrame holding the inner join of the three tables. Columns present in
    both ``sales`` and ``weekly_features`` (other than the join keys) get
    pandas' default ``_x`` / ``_y`` suffixes.

    Raises
    ------
    pandas.errors.MergeError
        If ``store_info`` or ``weekly_features`` contains duplicate keys.
        Without ``validate`` such duplicates would silently multiply sales rows.
    """
    with_store = pd.merge(sales, store_info, on='Store', validate='many_to_one')
    return pd.merge(with_store, weekly_features, on=['Store', 'Date'], validate='many_to_one')


# Merge the dataframes for training data and for test data
train_data = _attach_store_and_features(train, stores, features)
test_data = _attach_store_and_features(test, stores, features)

# An inner join drops rows that have no matching store/feature record.
# Surface that instead of letting the row count shrink unnoticed (for the
# test set it would mean predictions are missing for some rows).
for split_name, raw_rows, merged_rows in (
    ('train', len(train), len(train_data)),
    ('test', len(test), len(test_data)),
):
    if merged_rows != raw_rows:
        warnings.warn(
            f'{raw_rows - merged_rows} {split_name} rows had no matching '
            'store/feature record and were dropped by the merge'
        )

# Display the first few rows of the merged dataframes
print(train_data.head())
print(test_data.head())


   Store  Dept        Date  Weekly_Sales  IsHoliday_x Type    Size  \
0      1     1  2010-02-05      24924.50        False    A  151315   
1      1     2  2010-02-05      50605.27        False    A  151315   
2      1     3  2010-02-05      13740.12        False    A  151315   
3      1     4  2010-02-05      39954.04        False    A  151315   
4      1     5  2010-02-05      32229.38        False    A  151315   

   Temperature  Fuel_Price    MarkDown1    MarkDown2   MarkDown3    MarkDown4  \
0        42.31       2.572  7032.371786  3384.176594  1760.10018  3292.935886   
1        42.31       2.572  7032.371786  3384.176594  1760.10018  3292.935886   
2        42.31       2.572  7032.371786  3384.176594  1760.10018  3292.935886   
3        42.31       2.572  7032.371786  3384.176594  1760.10018  3292.935886   
4        42.31       2.572  7032.371786  3384.176594  1760.10018  3292.935886   

     MarkDown5         CPI  Unemployment  IsHoliday_y  
0  4132.216422  211.096358         8

In [None]:
def _with_date_parts(frame):
    """Return ``frame`` with ``Date`` parsed to datetime and ``Year``,
    ``Month`` and ``Day`` columns derived from it."""
    parsed = pd.to_datetime(frame['Date'])
    return frame.assign(
        Date=parsed,
        Year=parsed.dt.year,
        Month=parsed.dt.month,
        Day=parsed.dt.day,
    )


# Add the calendar columns, then one-hot encode the store ``Type``
# (the first category is dropped as the reference level)
train_data = pd.get_dummies(_with_date_parts(train_data), columns=['Type'], drop_first=True)
test_data = pd.get_dummies(_with_date_parts(test_data), columns=['Type'], drop_first=True)

print(train_data.head())
print(test_data.head())


   Store  Dept       Date  Weekly_Sales  IsHoliday_x    Size  Temperature  \
0      1     1 2010-02-05      24924.50        False  151315        42.31   
1      1     2 2010-02-05      50605.27        False  151315        42.31   
2      1     3 2010-02-05      13740.12        False  151315        42.31   
3      1     4 2010-02-05      39954.04        False  151315        42.31   
4      1     5 2010-02-05      32229.38        False  151315        42.31   

   Fuel_Price    MarkDown1    MarkDown2  ...    MarkDown4    MarkDown5  \
0       2.572  7032.371786  3384.176594  ...  3292.935886  4132.216422   
1       2.572  7032.371786  3384.176594  ...  3292.935886  4132.216422   
2       2.572  7032.371786  3384.176594  ...  3292.935886  4132.216422   
3       2.572  7032.371786  3384.176594  ...  3292.935886  4132.216422   
4       2.572  7032.371786  3384.176594  ...  3292.935886  4132.216422   

          CPI  Unemployment  IsHoliday_y  Year  Month  Day  Type_B  Type_C  
0  211.096358  

In [None]:
# Show the column labels of both merged frames (train first, then test)
for merged_frame in (train_data, test_data):
    print(merged_frame.columns)


Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday_x', 'Size',
       'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_y', 'Year',
       'Month', 'Day', 'Type_B', 'Type_C'],
      dtype='object')
Index(['Store', 'Dept', 'Date', 'IsHoliday_x', 'Size', 'Temperature',
       'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
       'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_y', 'Year', 'Month',
       'Day', 'Type_B', 'Type_C'],
      dtype='object')


In [None]:
# Define the target variable
y_train = train_data['Weekly_Sales']

# Columns that are not used as model inputs
columns_to_drop_train = ['Weekly_Sales', 'Date', 'IsHoliday_x', 'IsHoliday_y']
columns_to_drop_test = ['Date', 'IsHoliday_x', 'IsHoliday_y']

# Define the features
X_train = train_data.drop(columns=columns_to_drop_train)
X_test = test_data.drop(columns=columns_to_drop_test, errors='ignore')

# Force X_test to carry exactly the training feature columns, in the same
# order. This keeps the cell safe to re-run after later cells have added
# prediction columns to test_data, and covers the case where a store Type
# present in train never occurs in test (its dummy column is then absent).
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
missing_real_features = [col for col in missing_in_test if not col.startswith('Type_')]
if missing_real_features:
    # Only absent dummy columns can be filled in safely; anything else
    # would be fabricated data.
    raise KeyError(f'test_data is missing feature columns: {missing_real_features}')
X_test = X_test.reindex(columns=X_train.columns, fill_value=False)

print(X_train.head())
print(X_test.head())
print(y_train.head())


   Store  Dept    Size  Temperature  Fuel_Price    MarkDown1    MarkDown2  \
0      1     1  151315        42.31       2.572  7032.371786  3384.176594   
1      1     2  151315        42.31       2.572  7032.371786  3384.176594   
2      1     3  151315        42.31       2.572  7032.371786  3384.176594   
3      1     4  151315        42.31       2.572  7032.371786  3384.176594   
4      1     5  151315        42.31       2.572  7032.371786  3384.176594   

    MarkDown3    MarkDown4    MarkDown5         CPI  Unemployment  Year  \
0  1760.10018  3292.935886  4132.216422  211.096358         8.106  2010   
1  1760.10018  3292.935886  4132.216422  211.096358         8.106  2010   
2  1760.10018  3292.935886  4132.216422  211.096358         8.106  2010   
3  1760.10018  3292.935886  4132.216422  211.096358         8.106  2010   
4  1760.10018  3292.935886  4132.216422  211.096358         8.106  2010   

   Month  Day  Type_B  Type_C  
0      2    5   False   False  
1      2    5   False 

In [None]:
# Baseline model: LinearRegression with default settings.
# (The import lives with the other dependencies in the first cell so that
# every requirement of the notebook is visible in one place.)
model = LinearRegression()

# Fit on the training features and the Weekly_Sales target
model.fit(X_train, y_train)


In [None]:
# Score the test feature matrix with the fitted linear model
y_test_pred = model.predict(X=X_test)



In [None]:
# Attach the linear-model predictions to the test frame and export the
# identifying columns plus the prediction as CSV (row index omitted)
test_data['Predicted_Weekly_Sales'] = y_test_pred
linear_output_cols = ['Store', 'Dept', 'Date', 'Predicted_Weekly_Sales']
linear_output = test_data[linear_output_cols]
linear_output.to_csv('predicted_sales.csv', index=False)


In [None]:
# Second model: a 100-tree random forest with a fixed seed, built and
# fitted on the training data in one step (fit returns the estimator itself)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the test feature matrix
rf_y_test_pred = rf_model.predict(X_test)

# Attach the forest predictions to the test frame and export them as CSV
test_data['Predicted_Weekly_Sales_RF'] = rf_y_test_pred
rf_output_cols = ['Store', 'Dept', 'Date', 'Predicted_Weekly_Sales_RF']
test_data[rf_output_cols].to_csv('predicted_sales_rf.csv', index=False)


In [None]:
# Define a threshold for classification: sales above it count as the
# positive class
threshold = 5000  # Example threshold value

# Share of the most recent training dates held back for evaluation
validation_fraction = 0.2

# The test set has no Weekly_Sales column, so the test predictions cannot be
# scored. Scoring them against randomly generated targets produces numbers
# that say nothing about the model and change on every run. Instead, hold out
# the latest dates of the labelled training data and score a forest that
# never saw them. The split is chronological so the model is always
# evaluated on dates later than the ones it was fitted on.
distinct_dates = train_data['Date'].drop_duplicates().sort_values()
cutoff_date = distinct_dates.iloc[int(len(distinct_dates) * (1 - validation_fraction))]
is_validation = train_data['Date'] >= cutoff_date
if is_validation.all() or not is_validation.any():
    raise ValueError('Not enough distinct dates in train_data to build a hold-out split')

# X_train / y_train share train_data's index, so the mask lines up row by row
X_fit, y_fit = X_train.loc[~is_validation], y_train.loc[~is_validation]
X_val, y_val = X_train.loc[is_validation], y_train.loc[is_validation]

# Same hyper-parameters as rf_model, but fitted without the held-out rows.
# NOTE: this trains a second forest, so the cell costs about as much as the
# original random-forest fit.
val_model = RandomForestRegressor(n_estimators=100, random_state=42)
val_model.fit(X_fit, y_fit)
rf_y_val_pred = val_model.predict(X_val)

print(f'Hold-out: {len(X_val)} rows dated {cutoff_date} or later ({len(X_fit)} rows used for fitting)')

# Convert regression output to binary classification
y_val_class = (y_val > threshold).astype(int)
rf_y_val_pred_class = (rf_y_val_pred > threshold).astype(int)

# Calculate precision, recall, and accuracy
precision = precision_score(y_val_class, rf_y_val_pred_class)
recall = recall_score(y_val_class, rf_y_val_pred_class)
accuracy = accuracy_score(y_val_class, rf_y_val_pred_class)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Accuracy: {accuracy}')

# Calculate regression metrics
mae = mean_absolute_error(y_val, rf_y_val_pred)
mse = mean_squared_error(y_val, rf_y_val_pred)
r2 = r2_score(y_val, rf_y_val_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R²): {r2}')


Precision: 0.7492056645819855
Recall: 0.6049640697012968
Accuracy: 0.5526663422095529
Mean Absolute Error (MAE): 14679.80865925659
Mean Squared Error (MSE): 621596633.5974771
R-squared (R²): -17.630109034036582
