In [1]:
#!pip install numpy -U pandas-profiling matplotlib seaborn klib dtale scikit-learn joblib

In [2]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Directory that holds train.csv and test.csv. Set the NOTEBOOK_DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original location so existing setups continue to work.
DATA_DIR=Path(os.environ.get('NOTEBOOK_DATA_DIR', r'C:\Users\mahar\OneDrive\Desktop\jupyter\archive'))

df_train=pd.read_csv(DATA_DIR / 'train.csv')
df_test=pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
df_train

In [5]:
#df_test

In [6]:
df_train.shape

In [7]:
df_train.isnull().sum()

In [8]:
df_test.isnull().sum()

In [9]:
df_test.shape

In [10]:
df_train.info()

In [11]:
df_train.describe()

In [12]:
#Item_Weight is numerical column so we fill it with Mean Imputation

In [13]:
df_train['Item_Weight'].describe()

In [14]:
df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean(),inplace=True)
df_test['Item_Weight'].fillna(df_test['Item_Weight'].mean(),inplace=True)

In [15]:
df_test.isnull().sum()

In [16]:
df_train['Item_Weight'].describe()

In [17]:
#Outlet_Size is catagorical column so we fill it with Mode Imputation

In [18]:
df_train['Outlet_Size'].describe()

In [19]:
df_train['Outlet_Size']

In [20]:
df_train['Outlet_Size'].value_counts()

In [21]:
df_train['Outlet_Size'].mode()

In [22]:
df_train['Outlet_Size'].fillna(df_train['Outlet_Size'].mode()[0],inplace=True)
df_test['Outlet_Size'].fillna(df_test['Outlet_Size'].mode()[0],inplace=True)

In [23]:
df_test.isnull().sum()

# Selecting Features Based On Requirements

In [24]:
df_train.drop(['Item_Identifier','Outlet_Identifier'],axis=1,inplace=True)
df_test.drop(['Item_Identifier','Outlet_Identifier'],axis=1,inplace=True)

In [25]:
df_train

# EDA (Exploratory Data Analysis) 

### >>EDA Using Dtale lib

In [26]:
import dtale

In [27]:
# Hand the training frame to D-Tale; the object returned by show() is the cell output.
dtale.show(df_train)

### >>Checking Correlation Using Seaborn

In [28]:
# Correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr() no longer drops string columns silently and raises instead,
# so the numeric columns are selected explicitly before computing it.
numeric_corr=df_train.select_dtypes(include='number').corr()

plt.figure(figsize=(10,5))
sns.heatmap(numeric_corr,annot=True)
plt.title('Correlation between numeric columns')
plt.show()

### >>EDA using klib library

In [29]:
import klib
# klib is used in the following cells to visualize the training frame.

In [30]:
# presumably plots the categorical columns — confirm against the klib docs
klib.cat_plot(df_train)

In [31]:
# presumably a correlation matrix of the frame — confirm against the klib docs
klib.corr_mat(df_train)

In [32]:
# presumably the same correlations drawn as a plot — confirm against the klib docs
klib.corr_plot(df_train)

In [33]:
# presumably distribution plots of the numeric columns — confirm against the klib docs
klib.dist_plot(df_train)

# Data Cleaning Using Klib Library

In [34]:
 #klib.clean - functions for cleaning datasets

In [35]:
klib.data_cleaning(df_train) # performs datacleaning (drop duplicates & empty rows/cols, adjust dtypes,...)

In [36]:
klib.clean_column_names(df_train) # cleans and standardizes column names, also called inside data_cleaning()

In [37]:
df_train=klib.convert_datatypes(df_train) # converts existing to more efficient dtypes, also called inside data_cleaning()


# Preprocessing task before Model Building 

### >>Label Encoding

In [38]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [39]:
df_train=df_train.apply(le.fit_transform)

In [40]:
df_train

### >>One Hot Encoding

In [41]:
df_train =pd.get_dummies(df_train,columns=['item_fat_content','outlet_size','outlet_location_type','outlet_type'])

In [42]:
df_train

# Splitting Data Into Training And Testing

In [43]:
X=df_train.drop('item_outlet_sales',axis=1)

In [44]:
Y=df_train['item_outlet_sales']

In [45]:
Y

In [46]:
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle, and therefore every metric computed
# further down, identical on each Restart & Run All.
# test_size is left at the scikit-learn default.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=42)

# Standardization

In [47]:
X.describe()

In [48]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [49]:
X_train_std=sc.fit_transform(X_train)

In [50]:
X_test_std=sc.transform(X_test)

In [51]:
X_train_std

In [52]:
X_test_std

In [53]:
Y_train

In [54]:
Y_test

# Model Building

In [55]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

In [56]:
lr.fit(X_train_std,Y_train)

In [57]:
lr.predict(X_test_std)

In [58]:
Y_test

In [59]:
Y_predict_lr=lr.predict(X_test_std)

In [60]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [61]:
r2_score(Y_test,Y_predict_lr)


In [62]:
print(np.sqrt(mean_squared_error(Y_test,Y_predict_lr)))

In [63]:
mean_absolute_error(Y_test,Y_predict_lr)

In [64]:
!pip install sklearn --upgrade

In [65]:
!pip install joblib --upgrade
import joblib

In [66]:
# Persist the fitted linear regression; the value returned by joblib.dump is the cell output.
lr_model_path=r'C:\Users\mahar\OneDrive\Desktop\jupyter\archive\lr.sav'
joblib.dump(lr,lr_model_path)

In [79]:
from sklearn.ensemble import RandomForestRegressor
rf= RandomForestRegressor()

In [80]:
rf.fit(X_train_std,Y_train)

In [81]:
Y_pred_rf= rf.predict(X_test_std)

In [82]:
print(r2_score(Y_test,Y_pred_rf))
print(mean_absolute_error(Y_test,Y_pred_rf))
print(np.sqrt(mean_squared_error(Y_test,Y_pred_rf)))

# Hyperparameter Tuning

In [83]:
# RepeatedStratifiedKFold is imported but not used by the search below
# (stratified splitting is meant for classification targets).
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# define models and parameters
# Fixed seed: without it every candidate is fitted with different randomness
# and the cross-validation scores change from run to run.
model = RandomForestRegressor(random_state=42)
n_estimators = [10, 100, 1000]

# Candidate values for further tuning. They are NOT part of `grid` below, so
# they currently have no effect on the search.
max_depth=range(1,31)
# NOTE(review): a float of 1.0 is probably outside the accepted range for
# min_samples_leaf — confirm before adding this list to the grid.
min_samples_leaf=np.linspace(0.1, 1.0)
# "auto" is no longer accepted by recent scikit-learn releases; for a
# regressor it meant "use all features", which is spelled 1.0.
max_features=[1.0, "sqrt", "log2"]
min_samples_split=np.linspace(0.1, 1.0, 10)

# define grid search
grid = dict(n_estimators=n_estimators)

# error_score=np.nan (the scikit-learn default): a candidate whose fit fails is
# scored NaN and ranked last. With error_score=0 a failed fit would be
# reported as R^2 = 0, which looks like a real score and can even beat
# candidates that fitted successfully but scored below zero.
grid_search_forest = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                           scoring='r2',error_score=np.nan,verbose=2,cv=2)

grid_search_forest.fit(X_train_std, Y_train)

# summarize results
print(f"Best: {grid_search_forest.best_score_:.3f} using {grid_search_forest.best_params_}")
means = grid_search_forest.cv_results_['mean_test_score']
stds = grid_search_forest.cv_results_['std_test_score']
params = grid_search_forest.cv_results_['params']

# One line per candidate: mean CV score (standard deviation) and its parameters.
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:.3f} ({stdev:.3f}) with: {param}")


In [84]:
grid_search_forest.best_params_

In [85]:
grid_search_forest.best_score_

In [86]:
Y_pred_rf_grid=grid_search_forest.predict(X_test_std)

In [87]:
r2_score(Y_test,Y_pred_rf_grid)

# Save the Model

In [88]:
import joblib

In [89]:
joblib.dump(grid_search_forest,r'C:\Users\mahar\OneDrive\Desktop\jupyter\archive\random_forest_grid.sav')

In [90]:
model=joblib.load(r'C:\Users\mahar\OneDrive\Desktop\jupyter\archive\random_forest_grid.sav')