In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

#### Read the File

In [2]:
# Load the raw sales records from the CSV in the working directory.
# The original passed a non-default encoding, presumably because the file is
# not valid UTF-8 (TODO confirm). Latin-1 maps every byte to one character, so
# it reads such a file without errors. 'unicode_escape' accepts the same bytes
# but additionally treats backslash sequences in the data as escapes, which
# can corrupt or reject fields containing a literal backslash.
df = pd.read_csv('sales_data_sample.csv', encoding='latin-1')

In [3]:
# Columns of df that are fed to the model as predictors.
features = [
    'MONTH_ID',
    'YEAR_ID',
    'PRODUCTLINE',
    'MSRP',
    'COUNTRY',
    'DEALSIZE',
]

# get_dummies one-hot encodes any non-numeric predictor columns so the model
# receives an all-numeric matrix; SALES is the value being predicted.
x = pd.get_dummies(df.loc[:, features])
y = df.loc[:, 'SALES']

def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3, 4, 5 -> 'Spring'; 6, 7, 8 -> 'Summer'.
    Every other value (including 9, 10, 11) falls through to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above is reported as Fall.
    return 'Fall'

# Label every row with the season derived from its MONTH_ID and store it in a
# new SEASON column of df.
df['SEASON'] = df['MONTH_ID'].map(get_season)

#### Training the Model

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training rows (fit returns the estimator itself).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)

y_pred = model.predict(x_test)

# Put each held-out row's season next to its actual and predicted sales.
# The season is looked up in df by the index labels of the test rows.
results = pd.DataFrame(
    {
        'Season': df.loc[y_test.index, 'SEASON'],
        'Actual Value': y_test,
        'Predicted Value': y_pred,
    }
)
# index=False leaves the row labels out of the printed table.
print(results.head(10).to_string(index=False))

print("\n Average Values")
# Mean actual and predicted sales per season, rounded to two decimals.
avg = (
    results
    .groupby('Season')[['Actual Value', 'Predicted Value']]
    .mean()
    .round(2)
    .reset_index()
)
print(avg.to_string(index=False))

Season  Actual Value  Predicted Value
Winter       2231.00        2339.9187
Winter       4301.22        4393.9514
Summer       1585.36        1149.2981
Summer       2526.51        2765.5781
Winter       4187.22        4165.0682
Spring       7083.00        7588.7339
Summer       3138.34        4047.0626
Summer       3361.20        3982.3422
  Fall       3988.60        3695.4472
Winter       1820.01        2249.2340

 Average Values
Season  Actual Value  Predicted Value
  Fall       3478.15          3493.92
Spring       3764.82          3758.31
Summer       3600.38          3605.28
Winter       3430.01          3522.68


#### Checking the Accuracy

In [5]:
# MAE (mean absolute error): the average absolute gap between predicted and
# actual sales, in the same units as SALES.
# R2 (coefficient of determination): the share of the variance in actual sales
# that the predictions account for.

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"Mean Absolute Error (MAE): ${mae:.2f}",
    f"R-squared (R2) Score: {r2:.2f}",
    sep="\n",
)

# Interpretation: an MAE of about 663 is reasonable because a typical
# transaction is between 2000 and 5000, so predictions miss by roughly 663 on
# average.
# An R2 of 0.79 means the model explains about 79% of the variance in sales on
# the held-out rows.

Mean Absolute Error (MAE): $663.46
R-squared (R2) Score: 0.79
