### Multiple Linear Regression 

In [68]:
import numpy as np 
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [69]:
# Read the startup dataset from the working directory and preview its first rows
# (head() with no argument shows five).
dataset = pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [70]:
# Split the frame into features and target: every column except the last is a
# feature, and the last column (Profit in the preview above) is the target.
# .copy() makes X an independent frame, so later edits to X neither touch
# `dataset` nor trigger pandas' SettingWithCopyWarning.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]


In [71]:
# Count the rows per state to see how balanced the categorical feature is.
X['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [72]:
# Number of missing entries in the State column (0 means nothing to impute).
X['State'].isna().sum()

0

In [73]:
# One-hot encode State. drop_first=True removes the first category (California)
# so the dummies are not perfectly collinear -- the "dummy variable trap".
# Reading each row as (Florida, New York):
#   0,0 -> California    0,1 -> New York    1,0 -> Florida
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return booleans.
states = pd.get_dummies(X['State'], drop_first=True, dtype=int)

# Preview only the first rows instead of dumping the whole frame.
states.head(10)

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0
5,0,1
6,0,0
7,1,0
8,0,1
9,0,0


In [74]:
# Remove the raw text State column from the feature matrix.
# Reassigning instead of inplace=True avoids mutating a frame that was sliced
# out of `dataset`; errors='ignore' keeps this cell safe to re-run once the
# column is already gone.
X = X.drop(columns='State', errors='ignore')

In [75]:
# Append the one-hot State indicators to the feature matrix as extra columns.
X = pd.concat(objs=[X, states], axis='columns')

In [76]:
# Preview the first ten rows of the feature matrix.
X.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0
5,131876.9,99814.71,362861.36,0,1
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,1,0
8,120542.52,148718.95,311613.29,0,1
9,123334.88,108679.17,304981.62,0,0


In [77]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [78]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the construction and fit can be chained; the trailing expression
# still displays the fitted model as the cell output.
regressor = LinearRegression().fit(x_train, y_train)
regressor

LinearRegression()

In [81]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(x_test)

In [86]:
# Side-by-side table of the held-out actual values and the model's predictions.
# np.array(y_test) discards the original row labels so both columns share a
# fresh 0..n-1 index.
look = pd.DataFrame({'Actual': np.array(y_test), 'predicted': y_pred})
look

Unnamed: 0,Actual,predicted
0,103282.38,103015.201598
1,144259.4,132582.277608
2,146121.95,132447.738452
3,77798.83,71976.098513
4,191050.39,178537.482211
5,105008.31,116161.242302
6,81229.06,67851.692097
7,97483.56,98791.733747
8,110352.25,113969.43533
9,166187.94,167921.065696


In [96]:
# Manual R^2 (coefficient of determination) on the held-out rows:
#   R^2    = 1 - SS_res / SS_tot
#   SS_res = sum((y - y_hat)**2)     y_hat  = predicted value
#   SS_tot = sum((y - y_mean)**2)    y_mean = mean of the actual values
# Neither sum needs a 1/n factor: it would cancel in the ratio.

actual = look['Actual'].to_numpy()
predicted = look['predicted'].to_numpy()

n = len(actual)
mean = actual.sum() / n

# Vectorised sums replace the per-row Python loop and do not rely on the
# DataFrame's index labels being 0..n-1.
ssres = ((actual - predicted) ** 2).sum()
ssmean = ((actual - mean) ** 2).sum()

val = 1 - ssres/ssmean
print(val)

0.9347068473282423


In [98]:
# Cross-check the hand-computed R^2 against scikit-learn's implementation.
from sklearn.metrics import r2_score

score = r2_score(y_true=y_test, y_pred=y_pred)
score

0.9347068473282423

# Multicollinearity in Linear Regression 