# Test / Train

In [1]:
import numpy as np

In [2]:
x = np.arange(10).reshape(5,2)
x

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [3]:
y = range(5)
list(y)

[0, 1, 2, 3, 4]

# Importing sklearn

In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split # to split data into train and test
from sklearn.linear_model import LinearRegression # for models and linear regression
from sklearn import metrics

In [5]:
x_train, x_test, y_train, y_test = train_test_split( x , y )

In [6]:
x_train

array([[4, 5],
       [8, 9],
       [2, 3]])

In [7]:
y_train

[2, 4, 1]

In [8]:
model = LinearRegression()

In [9]:
model.fit(x_train, y_train)

LinearRegression()

In [10]:
predict = model.predict(x_test)
predict

array([ 3.0000000e+00, -4.4408921e-16])

In [11]:
y_test

[3, 0]

In [12]:
metrics.mean_absolute_error(y_test, predict)

2.220446049250313e-16

# Housing Data

In [13]:
data = pd.read_csv('USA_Housing.csv')
data.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [14]:
data['Price'].round()

0       1059034.0
1       1505891.0
2       1058988.0
3       1260617.0
4        630943.0
          ...    
4995    1060194.0
4996    1482618.0
4997    1030730.0
4998    1198657.0
4999    1298950.0
Name: Price, Length: 5000, dtype: float64

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


# Dividing the Dataset

In [16]:
x = data[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population']]
y = data[['Price']]

# Splitting the data

In [20]:
x_train, x_test, y_train, y_test = train_test_split( x , y )
x_train

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
3453,57852.033310,5.690224,7.287616,6.18,42688.461344
2166,65013.212446,7.321305,7.591328,3.13,23039.889474
1933,62128.784932,6.598323,7.969452,6.45,37889.761391
4644,60883.536887,6.281292,6.408819,2.37,31135.056781
2690,82943.889067,6.021106,6.278453,4.25,50524.736738
...,...,...,...,...,...
3550,76416.905353,4.652848,7.383606,4.46,49032.665856
51,49408.198340,5.825920,5.831739,3.32,26881.130598
4405,77089.653662,6.122816,6.265592,4.38,46740.886718
3831,64644.898809,3.232059,6.518794,2.13,45329.508837


# Training

In [21]:
model = LinearRegression()

In [22]:
model.fit(x_train, y_train)

LinearRegression()

# Predicting


In [23]:
predict = model.predict(x_test)
predict.round(2)

array([[1597217.9 ],
       [1295921.96],
       [1370804.09],
       ...,
       [1376775.95],
       [1393994.42],
       [ 767077.41]])

# MAE - Mean Absolute Error

In [24]:
metrics.mean_absolute_error(y_test, predict)

80890.9716458835

# MSE - Mean Squared Error

In [25]:
mse = metrics.mean_squared_error(y_test, predict)
mse

10138341331.31159

# RMSE - Root Mean Squared Error

In [26]:
rmse = np.sqrt(mse)
rmse

100689.33077199188

# Model Evaluvation

In [27]:
model.intercept_

array([-2641652.29833808])

In [28]:
model.coef_

array([[2.17320581e+01, 1.64461565e+05, 1.20069334e+05, 2.22238998e+03,
        1.52908614e+01]])

# Ecommerce Data

In [29]:
data = pd.read_csv('Ecommerce Customers')
data.describe()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
count,500.0,500.0,500.0,500.0,500.0
mean,33.053194,12.052488,37.060445,3.533462,499.314038
std,0.992563,0.994216,1.010489,0.999278,79.314782
min,29.532429,8.508152,33.913847,0.269901,256.670582
25%,32.341822,11.388153,36.349257,2.93045,445.038277
50%,33.082008,11.983231,37.069367,3.533975,498.887875
75%,33.711985,12.75385,37.716432,4.126502,549.313828
max,36.139662,15.126994,40.005182,6.922689,765.518462


In [30]:
data.columns

Index(['Email', 'Address', 'Avatar', 'Avg. Session Length', 'Time on App',
       'Time on Website', 'Length of Membership', 'Yearly Amount Spent'],
      dtype='object')

# Dividing the Dataset

In [31]:
x = data[['Avg. Session Length', 'Time on App',
       'Time on Website', 'Length of Membership']]
y = data[['Yearly Amount Spent']]

# Splitting the Data

In [32]:
x_train, x_test, y_train, y_test = train_test_split( x , y )
x_train

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
210,31.765619,12.442617,38.131712,3.850280
139,31.954904,10.963132,37.327283,3.578634
419,33.717555,10.806966,36.012317,3.701229
401,33.247322,11.956426,36.517346,3.451751
405,33.441553,11.235969,37.052616,3.904479
...,...,...,...,...
379,33.971722,12.284467,38.295725,1.130477
322,33.264632,10.732131,36.145792,4.086566
223,34.814984,12.114945,36.288724,4.389455
463,31.874552,10.290351,36.929762,3.491093


# Training

In [33]:
model = LinearRegression()

In [34]:
model.fit(x_train, y_train)

LinearRegression()

# Predicting

In [35]:
predict = model.predict(x_test)
predict.round(2)

array([[455.7 ],
       [487.35],
       [554.17],
       [621.25],
       [509.69],
       [518.29],
       [554.53],
       [505.97],
       [523.82],
       [287.36],
       [641.11],
       [535.73],
       [574.02],
       [453.45],
       [449.75],
       [555.37],
       [487.31],
       [458.33],
       [643.68],
       [584.51],
       [499.43],
       [494.25],
       [598.95],
       [578.28],
       [370.71],
       [435.3 ],
       [509.39],
       [583.52],
       [534.83],
       [316.7 ],
       [431.8 ],
       [442.36],
       [482.18],
       [501.9 ],
       [501.17],
       [463.94],
       [551.06],
       [515.93],
       [601.88],
       [516.78],
       [490.37],
       [544.13],
       [528.33],
       [528.48],
       [502.16],
       [461.6 ],
       [458.16],
       [526.11],
       [417.2 ],
       [525.13],
       [611.34],
       [543.54],
       [548.24],
       [410.25],
       [508.02],
       [478.78],
       [513.18],
       [461.17],
       [444.25

In [36]:
metrics.mean_absolute_error(y_test, predict)

7.877202490641659

In [37]:
mse = metrics.mean_squared_error(y_test, predict)
mse

98.6603387555007

In [38]:
rmse = np.sqrt(mse)
rmse

9.932791085868097

# Model Evaluvation

In [39]:
model.intercept_

array([-1036.13415366])

In [40]:
model.coef_

array([[2.58107738e+01, 3.83983681e+01, 1.66331069e-02, 6.20040777e+01]])