In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
df = pd.read_csv("50_Startups (1).csv")

In [4]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [7]:
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [8]:
num_col = df.select_dtypes(exclude = 'object')
cat_col = df.select_dtypes(include = 'object')

In [9]:
cat_col.head()

Unnamed: 0,State
0,New York
1,California
2,Florida
3,New York
4,Florida


In [10]:
ohe = OneHotEncoder(drop= 'first')

In [11]:
data1 = ohe.fit_transform(cat_col).toarray()

In [12]:
data1

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.]])

In [13]:
data1 = pd.DataFrame(data1, columns=['State_0','State_1'])

In [14]:
x = pd.concat([num_col, data1], axis = 'columns')

In [15]:
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_0,State_1
0,165349.2,136897.8,471784.1,192261.83,0.0,1.0
1,162597.7,151377.59,443898.53,191792.06,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,1.0,0.0
3,144372.41,118671.85,383199.62,182901.99,0.0,1.0
4,142107.34,91391.77,366168.42,166187.94,1.0,0.0


In [16]:
X = x.drop('Profit', axis = 'columns')
y = x['Profit']

In [17]:
model = Lasso()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [27]:
def model_eval(true, pred):
    mse = mean_squared_error(true, pred)
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, pred)
    return mse, mae, rmse, r2

In [28]:
models = {
    'Lasso Model':Lasso(),
    'Ridge Model' : Ridge()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # Evaluating the models :
    mse, mae, rmse, r2 = model_eval(y_test, pred)

    print('-------------------------------------------------\n')
    print("{} Model Has :".format(list(models.keys())[i]))
    print('Mean Squared Error : {}'.format(mse))
    print('Mean Absolute Errro {} :'.format(mae))
    print('Root Means Squared Error {}'.format(rmse))
    print('R2 Score : {} \n'.format(r2))

    print('---------------------------------------------------\n')

-------------------------------------------------

Lasso Model Model Has :
Mean Squared Error : 199715243.80987325
Mean Absolute Errro 9803.710524894666 :
Root Means Squared Error 14132.064385993763
R2 Score : 0.9091047018448217 

---------------------------------------------------

-------------------------------------------------

Ridge Model Model Has :
Mean Squared Error : 198795412.28276825
Mean Absolute Errro 9775.6726944195 :
Root Means Squared Error 14099.482695573204
R2 Score : 0.909523339697966 

---------------------------------------------------

