##SIMPLE LINEAR REGRESSION

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df=pd.read_excel("/content/SuperStoreUS-2015.xlsx")
df.head()

Unnamed: 0,Row ID,Order Priority,Discount,Unit Price,Shipping Cost,Customer ID,Customer Name,Ship Mode,Customer Segment,Product Category,...,Region,State or Province,City,Postal Code,Order Date,Ship Date,Profit,Quantity ordered new,Sales,Order ID
0,20847,High,0.01,2.84,0.93,3,Bonnie Potter,Express Air,Corporate,Office Supplies,...,West,Washington,Anacortes,98221,2015-01-07,2015-01-08,4.56,4,13.01,88522
1,20228,Not Specified,0.02,500.98,26.0,5,Ronnie Proctor,Delivery Truck,Home Office,Furniture,...,West,California,San Gabriel,91776,2015-06-13,2015-06-15,4390.3665,12,6362.85,90193
2,21776,Critical,0.06,9.48,7.29,11,Marcus Dunlap,Regular Air,Home Office,Furniture,...,East,New Jersey,Roselle,7203,2015-02-15,2015-02-17,-53.8096,22,211.15,90192
3,24844,Medium,0.09,78.69,19.99,14,Gwendolyn F Tyson,Regular Air,Small Business,Furniture,...,Central,Minnesota,Prior Lake,55372,2015-05-12,2015-05-14,803.4705,16,1164.45,86838
4,24846,Medium,0.08,3.28,2.31,14,Gwendolyn F Tyson,Regular Air,Small Business,Office Supplies,...,Central,Minnesota,Prior Lake,55372,2015-05-12,2015-05-13,-24.03,7,22.23,86838


Data PreProcessing

In [None]:
#Drop unwanted columns
df = df.drop(['Row ID','Customer ID','Customer Name','Postal Code','Order ID','Region'],axis=1)

In [None]:
#Change columns to datetime
df['Order Date']=pd.to_datetime(df['Order Date'])
df['Ship Date']=pd.to_datetime(df['Ship Date'])

In [None]:
#Extract month and year
df['Order Month']=df['Order Date'].dt.month
df['Order Year']=df['Order Date'].dt.year

In [None]:
#Displaying updated columns
df.columns

Index(['Order Priority', 'Discount', 'Unit Price', 'Shipping Cost',
       'Ship Mode', 'Customer Segment', 'Product Category',
       'Product Sub-Category', 'Product Container', 'Product Name',
       'Product Base Margin', 'Country', 'State or Province', 'City',
       'Order Date', 'Ship Date', 'Profit', 'Quantity ordered new', 'Sales',
       'Order Month', 'Order Year'],
      dtype='object')

In [None]:
#Label Encoding - converting categorical values into numerical
def encode_columns(column_name):
  from sklearn.preprocessing import LabelEncoder
  encod = LabelEncoder()
  df[column_name+'Encoded']=encod.fit_transform(df[column_name])

In [None]:
#List of features that need to be label encoded:
l = ['Order Priority','Ship Mode', 'Customer Segment', 'Product Category','Product Sub-Category', 'Product Container', 'Product Name','Country', 'State or Province', 'City']
for i in range (0,len(l)):
  encode_columns(l[i])

In [None]:
df.head()

Unnamed: 0,Order Priority,Discount,Unit Price,Shipping Cost,Ship Mode,Customer Segment,Product Category,Product Sub-Category,Product Container,Product Name,...,Order PriorityEncoded,Ship ModeEncoded,Customer SegmentEncoded,Product CategoryEncoded,Product Sub-CategoryEncoded,Product ContainerEncoded,Product NameEncoded,CountryEncoded,State or ProvinceEncoded,CityEncoded
0,High,0.01,2.84,0.93,Express Air,Corporate,Office Supplies,Pens & Art Supplies,Wrap Bag,SANFORD Liquid Accent™ Tank-Style Highlighters,...,2,1,1,1,11,6,665,0,45,16
1,Not Specified,0.02,500.98,26.0,Delivery Truck,Home Office,Furniture,Chairs & Chairmats,Jumbo Drum,Global Troy™ Executive Leather Low-Back Tilter,...,5,0,2,0,3,1,408,0,3,691
2,Critical,0.06,9.48,7.29,Regular Air,Home Office,Furniture,Office Furnishings,Small Pack,"DAX Two-Tone Rosewood/Black Document Frame, De...",...,0,2,2,0,8,5,257,0,28,666
3,Medium,0.09,78.69,19.99,Regular Air,Small Business,Furniture,Office Furnishings,Small Box,Howard Miller 12-3/4 Diameter Accuwave DS ™ Wa...,...,4,2,3,0,8,4,469,0,21,626
4,Medium,0.08,3.28,2.31,Regular Air,Small Business,Office Supplies,Pens & Art Supplies,Wrap Bag,Newell 321,...,4,2,3,1,11,6,572,0,21,626


In [None]:
df.isnull().sum()

Order Priority                  0
Discount                        0
Unit Price                      0
Shipping Cost                   0
Ship Mode                       0
Customer Segment                0
Product Category                0
Product Sub-Category            0
Product Container               0
Product Name                    0
Product Base Margin            16
Country                         0
State or Province               0
City                            0
Order Date                      0
Ship Date                       0
Profit                          0
Quantity ordered new            0
Sales                           0
Order Month                     0
Order Year                      0
Order PriorityEncoded           0
Ship ModeEncoded                0
Customer SegmentEncoded         0
Product CategoryEncoded         0
Product Sub-CategoryEncoded     0
Product ContainerEncoded        0
Product NameEncoded             0
CountryEncoded                  0
State or Provi

LINEAR REGRESSION IMPUTATION

In [None]:
from sklearn.linear_model import LinearRegression

#Separate rows without missing values
df_temp = df[df['Product Base Margin'].notna()]

#Product Base Margin coumn values:
target = np.array(df_temp['Product Base Margin'])

#Product name column values for training
feature = np.array(df_temp['Product NameEncoded'])

#Reshape - each variable is holding only a single feature
target = target.reshape(-1,1)
feature = feature.reshape(-1,1)

#Create a linear regression model
model = LinearRegression()

#Fit the model
model.fit(feature,target)

#Predict Product Base Margin using Product Name values
predicted_values = model.predict(np.array(df['Product NameEncoded']).reshape(-1,1))

#Fill missing values with predicted values
df['Product Base Margin'] = predicted_values
print(df['Product Base Margin'])

0       0.497349
1       0.518706
2       0.531255
3       0.513637
4       0.505077
          ...   
1947    0.531754
1948    0.537488
1949    0.492944
1950    0.528097
1951    0.489786
Name: Product Base Margin, Length: 1952, dtype: float64


FEATURE SELECTION

In [None]:
X=df.drop(['Profit','Order Priority','Ship Mode', 'Customer Segment', 'Product Category','Product Sub-Category', 'Product Container', 'Product Name','Country', 'State or Province', 'City','Order Month','Order Date','Ship Date'],axis=1)
y=df['Profit']
print(X)
print(y)
y.info()

      Discount  Unit Price  Shipping Cost  Product Base Margin  \
0         0.01        2.84           0.93             0.497349   
1         0.02      500.98          26.00             0.518706   
2         0.06        9.48           7.29             0.531255   
3         0.09       78.69          19.99             0.513637   
4         0.08        3.28           2.31             0.505077   
...        ...         ...            ...                  ...   
1947      0.01       10.90           7.46             0.531754   
1948      0.10        7.99           5.03             0.537488   
1949      0.08       11.97           5.81             0.492944   
1950      0.10        9.38           4.93             0.528097   
1951      0.10      105.98          13.99             0.489786   

      Quantity ordered new    Sales  Order Year  Order PriorityEncoded  \
0                        4    13.01        2015                      2   
1                       12  6362.85        2015            

In [None]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest,chi2,f_regression

# configure to select all features
# f_regression - uses forlinear regression to select the features
# k - number of features needs to be selected
fs = SelectKBest(score_func=f_regression, k=15)

# learn relationship from training data
z=fs.fit_transform(X, y)

# returns a Boolean array indicating which features were selected by the algorithm
support = fs.get_support()

#Store the feature names
features = np.array(X.columns)

print("All features:")
print(features)

print("Selected best:")
print(features[support])
print(z)

All features:
['Discount' 'Unit Price' 'Shipping Cost' 'Product Base Margin'
 'Quantity ordered new' 'Sales' 'Order Year' 'Order PriorityEncoded'
 'Ship ModeEncoded' 'Customer SegmentEncoded' 'Product CategoryEncoded'
 'Product Sub-CategoryEncoded' 'Product ContainerEncoded'
 'Product NameEncoded' 'CountryEncoded' 'State or ProvinceEncoded'
 'CityEncoded']
Selected best:
['Discount' 'Unit Price' 'Shipping Cost' 'Product Base Margin'
 'Quantity ordered new' 'Sales' 'Order PriorityEncoded' 'Ship ModeEncoded'
 'Customer SegmentEncoded' 'Product CategoryEncoded'
 'Product Sub-CategoryEncoded' 'Product ContainerEncoded'
 'Product NameEncoded' 'State or ProvinceEncoded' 'CityEncoded']
[[1.0000e-02 2.8400e+00 9.3000e-01 ... 6.6500e+02 4.5000e+01 1.6000e+01]
 [2.0000e-02 5.0098e+02 2.6000e+01 ... 4.0800e+02 3.0000e+00 6.9100e+02]
 [6.0000e-02 9.4800e+00 7.2900e+00 ... 2.5700e+02 2.8000e+01 6.6600e+02]
 ...
 [8.0000e-02 1.1970e+01 5.8100e+00 ... 7.1800e+02 1.1000e+01 1.8300e+02]
 [1.0000e-01 9.

In [None]:
#Using the selected features from SelectKBest into linear regression model
selected_features=['Discount','Unit Price', 'Product Base Margin' ,'Quantity ordered new',
 'Sales' ,'Order PriorityEncoded', 'Ship ModeEncoded',
 'Customer SegmentEncoded' ,'Product CategoryEncoded',
 'Product Sub-CategoryEncoded', 'Product ContainerEncoded',
 'Product NameEncoded', 'State or ProvinceEncoded', 'CityEncoded' ]
X=df[selected_features]
y=df['Profit']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test , y_train, y_test = train_test_split(X,y,test_size=0.10, random_state = 5)

In [None]:
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)
pred1 = lr.predict(X_test)
train_Score1 =lr.score(X_train,y_train)
test_Score1 =lr.score(X_test,y_test)
r2score1 =r2_score(y_test,pred1)
r2s=r2score1
mse1=mean_squared_error(y_test, pred1)
rmse1=math.sqrt(mse1)
mae1 = mean_absolute_error(y_test, pred1)
evs1=explained_variance_score(pred1,y_test)
print('Train Score :',train_Score1)
print('Test Score :',test_Score1)
print('R2 score :',r2score1)
print('RMSE :',rmse1)
print('MSE: ',mse1)
print('MAE: ',mae1)
print('Explained Variance :',evs1)

Train Score : 0.3475888638973871
Test Score : -0.01850600041432582
R2 score : -0.01850600041432582
RMSE : 1352.8294575513225
MSE:  1830147.5412186058
MAE:  435.4728566274663
Explained Variance : -9.723740889736767
