In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
# Set up plotly and cufflinks for in-notebook rendering; the DataFrame
# `.iplot(...)` calls further down rely on cufflinks being configured here.
# NOTE(review): connected=True presumably loads plotly.js from the network
# rather than embedding it in the notebook -- confirm if offline use matters.
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as sm

In [4]:
import warnings
# Installs a blanket 'ignore' filter: every warning category is silenced for
# the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# pandas / scikit-learn / statsmodels; consider narrowing it to a specific
# category or message once the noisy source is known.
warnings.filterwarnings('ignore')

In [5]:
# Load the startups dataset from a CSV expected in the notebook's working
# directory, then display the resulting frame as the cell output.
df = pd.read_csv('50_Startups.csv')
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(50, 5)

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [8]:
# Column dtypes and non-null counts; shows which columns are numeric and which are object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# Number of missing values per column.
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [10]:
# Column labels as loaded from the CSV (before any renaming).
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [11]:
# Give the two spend columns identifier-safe names (no '&' or spaces) so they
# can be referenced directly inside the statsmodels formula string later on.
df = df.rename(
    columns={
        'R&D Spend': 'RandD_Spend',
        'Marketing Spend': 'Marketing_Spend',
    }
)

In [12]:
# Confirm the renamed column labels.
df.columns

Index(['RandD_Spend', 'Administration', 'Marketing_Spend', 'State', 'Profit'], dtype='object')

In [13]:
# Correlation heatmap of the numeric columns.
# `State` is a string (object) column, and pandas >= 2.0 no longer silently
# drops non-numeric columns in DataFrame.corr() -- it raises instead. Select
# the numeric columns explicitly so the cell runs on old and new pandas alike.
fig = py.imshow(
    df.select_dtypes(include='number').corr(),
    text_auto=True,
    aspect="auto",
    color_continuous_scale='gnbu',
)
fig.show()

In [14]:
# Box plot per numeric column. The categorical `State` column is excluded
# because a box plot of string labels carries no information.
df.select_dtypes(include='number').iplot(kind='box')

In [15]:
# Histogram of the R&D spend column, drawn through the cufflinks `.iplot` accessor.
df['RandD_Spend'].iplot(kind='hist', color='blue')

In [16]:
# Feature matrix: the three spend columns used as regressors.
x = df.loc[:, ['RandD_Spend', 'Administration', 'Marketing_Spend']]
# Show the first row as a quick sanity check of the selection.
x.head(1)

Unnamed: 0,RandD_Spend,Administration,Marketing_Spend
0,165349.2,136897.8,471784.1


In [17]:
# Target kept as a one-column DataFrame (not a Series), so downstream
# scikit-learn predictions come back as 2-D arrays.
y = df.loc[:, ['Profit']]
# Show the first row as a quick sanity check of the selection.
y.head(1)

Unnamed: 0,Profit
0,192261.83


In [18]:
# Profit against all three feature columns on one figure, each with an
# OLS trendline.
plot = py.scatter(
    df,
    x=x.columns,
    y="Profit",
    trendline='ols',
)
plot.show()

Plot Based on R&D Spend with Profit

In [19]:
# Profit vs. R&D spend with an OLS trendline; the axis label is overridden
# to a human-readable form.
plot = py.scatter(
    df,
    x="RandD_Spend",
    y="Profit",
    trendline='ols',
    labels={'RandD_Spend': 'R and D Spend'},
)
plot.show()

Plot Based on Administration with Profit

In [20]:
# Profit vs. administration cost with an OLS trendline.
plot = py.scatter(
    df,
    x="Administration",
    y="Profit",
    trendline='ols',
)
plot.show()

Plot Based on Marketing Spend with Profit

In [21]:
# Profit vs. marketing spend with an OLS trendline; the axis label is
# overridden to a human-readable form.
plot = py.scatter(
    df,
    x="Marketing_Spend",
    y="Profit",
    trendline='ols',
    labels={'Marketing_Spend': 'Marketing Spend'},
)
plot.show()

## Implement Linear Regression

In [22]:
# Fit a linear regression of Profit on the three spend columns.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing expression displays the fitted estimator as the cell output.
lr = LinearRegression().fit(x, y)
lr

In [23]:
# Predictions on the same rows the model was fitted on (in-sample).
pred = lr.predict(x)

In [24]:
# R-squared of the fit. `pred` was produced from the training features, so
# this measures goodness of fit on seen data, not held-out performance.
r2_score(y, pred)

0.9507459940683246

In [25]:
# Mean squared error on the training rows, in squared units of Profit.
mean_squared_error(y, pred)

78417126.01913083

### Using OLS Model

In [26]:
# Same regression through the statsmodels formula interface, which adds an
# intercept automatically and provides the full inference table.
model1 = sm.ols(
    formula='Profit~RandD_Spend+Administration+Marketing_Spend',
    data=df,
).fit()
# Coefficients, standard errors, p-values and fit diagnostics.
model1.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 18 Aug 2023",Prob (F-statistic):,4.53e-30
Time:,13:16:49,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
RandD_Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing_Spend,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [27]:
# Store the statsmodels in-sample fitted values next to the observed Profit
# and display the frame.
# This mutates `df` in place: earlier cells that operate on the whole frame
# will see the extra column if they are re-run after this one.
# NOTE(review): 'predection' looks like a misspelling of 'prediction'; the
# label is left unchanged here because it is a runtime-visible column name
# that other code may reference.
df['predection'] = model1.fittedvalues
df

Unnamed: 0,RandD_Spend,Administration,Marketing_Spend,State,Profit,predection
0,165349.2,136897.8,471784.1,New York,192261.83,192521.25289
1,162597.7,151377.59,443898.53,California,191792.06,189156.768232
2,153441.51,101145.55,407934.54,Florida,191050.39,182147.279096
3,144372.41,118671.85,383199.62,New York,182901.99,173696.700026
4,142107.34,91391.77,366168.42,Florida,166187.94,172139.514183
5,131876.9,99814.71,362861.36,New York,156991.12,163580.780571
6,134615.46,147198.87,127716.82,California,156122.51,158114.096669
7,130298.13,145530.06,323876.68,Florida,155752.6,160021.363048
8,120542.52,148718.95,311613.29,New York,152211.77,151741.699699
9,123334.88,108679.17,304981.62,California,149759.96,154884.68411


### Predicting for new data

In [28]:
# One hypothetical startup to score with both fitted models.
# All three values exceed the column maxima reported by df.describe()
# (165349.20, 182645.56, 471784.10), so any prediction is an extrapolation.
newdata = pd.DataFrame(
    {
        'RandD_Spend': [187465],
        "Administration": [211897.10],
        "Marketing_Spend": [499657.90],
    },
    index=[1],
)
newdata

Unnamed: 0,RandD_Spend,Administration,Marketing_Spend
1,187465,211897.1,499657.9


In [29]:
# Score the new row with the scikit-learn model. The result is a 2-D array
# because the model was fitted on a one-column DataFrame target.
lr.predict(newdata)

array([[209088.05656522]])

In [30]:
# Score the same row with the statsmodels formula model; the result is a
# Series indexed like `newdata`.
model1.predict(newdata)

1    209088.056565
dtype: float64