In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

### step 1: Read the dataset

In [2]:
import pandas as pd

df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### Step 2: Perform Basic data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
m = df.isna().sum()
m[m>0]

Series([], dtype: int64)

In [5]:
df.duplicated().sum()

np.int64(0)

### Step 3 : Seperate X and Y (Profit)

In [6]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [7]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


### Step 4: Preprocessing on X

In [9]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [10]:
cat

['STATE']

In [11]:
con

['RND', 'ADMIN', 'MKT']

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:

num_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

In [14]:
cat_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [15]:
pre = ColumnTransformer([('num', num_pipe, con), ('cat', cat_pipe, cat)]).set_output(transform='pandas')

In [16]:
X_pre = pre.fit_transform(X)
X_pre

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0
5,1.2798,-0.776239,1.25421,0.0,0.0,1.0
6,1.340066,0.932147,-0.68815,1.0,0.0,0.0
7,1.245057,0.87198,0.932186,0.0,1.0,0.0
8,1.030369,0.986952,0.830887,0.0,0.0,1.0
9,1.091819,-0.45664,0.776107,1.0,0.0,0.0


### Step 5: Apply train test split

In [17]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.2, random_state=21)

In [18]:
xtrain.shape

(40, 6)

In [19]:
xtest.shape

(10, 6)

In [20]:

xtrain.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
30,-0.258074,-0.205629,-0.990357,0.0,1.0,0.0
21,0.102724,1.169186,0.732788,0.0,0.0,1.0
19,0.279442,1.159837,-1.743127,0.0,0.0,1.0
11,0.593085,-1.06554,0.319834,1.0,0.0,0.0
22,0.006007,0.05185,0.762376,0.0,1.0,0.0


In [21]:
xtest.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
7,1.245057,0.87198,0.932186,0.0,1.0,0.0
44,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
43,-1.281134,0.217682,-1.449605,0.0,0.0,1.0
25,-0.199312,0.656489,-0.603517,1.0,0.0,0.0
14,1.017181,1.269199,0.375742,0.0,1.0,0.0


In [22]:
ytrain.head()

Unnamed: 0,PROFIT
30,99937.59
21,111313.02
19,122776.86
11,144259.4
22,110352.25


In [23]:
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


### Step 6 : Build the model

In [25]:
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor(max_depth=1, min_samples_split=5, min_samples_leaf=5, criterion='squared_error')
model1.fit(xtrain, ytrain)



In [26]:
model1.score(xtrain, ytrain)

0.6268054892550098

In [27]:
model1.score(xtest, ytest)

0.6434222460818757

### Hyperparameter tuning 

In [30]:
params = {'max_depth' : [2,3,4,5,6,7,8,9,10,11],
          'min_samples_split' : [1,2,3,4,5,6,7,8,9,10],
          'min_samples_leaf' : [1,2,3,4,5,6,7,8,9,10],
          'criterion' : ['squared_error', 'absolute_error']}

In [31]:
from sklearn.model_selection import GridSearchCV
dtr = DecisionTreeRegressor()
gscv_dtr = GridSearchCV(estimator=dtr, param_grid=params, cv=3, scoring='r2')
gscv_dtr.fit(xtrain, ytrain)

In [32]:
gscv_dtr.best_params_

{'criterion': 'squared_error',
 'max_depth': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [33]:
gscv_dtr.best_score_

np.float64(0.8763082156671418)

In [34]:
best_dtr = gscv_dtr.best_estimator_
best_dtr

In [35]:
best_dtr.score(xtrain, ytrain)

1.0

In [36]:
best_dtr.score(xtest, ytest)

0.9282405178222957

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [38]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)

    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)

    print(f'Mean squared Error : {mse:.2f}')
    print(f'Root Mean squared Error : {rmse:.2f}')
    print(f'Mean Absolute Error : {mae:.2f}')
    print(f'Mean Absolute Percentage Error : {mape:.4f}')
    print(f'R2 score:{r2:.4f}')

In [39]:
evaluate_model(best_dtr, xtrain, ytrain)

Mean squared Error : 0.00
Root Mean squared Error : 0.00
Mean Absolute Error : 0.00
Mean Absolute Percentage Error : 0.0000
R2 score:1.0000


In [40]:
evaluate_model(best_dtr, xtest, ytest)

Mean squared Error : 127354916.33
Root Mean squared Error : 11285.16
Mean Absolute Error : 7891.39
Mean Absolute Percentage Error : 0.1103
R2 score:0.9282


### As r2 score is >0.8 hence it is good model

### Out of sample predictions

In [41]:
xnew = pd.read_csv('startups_sample.csv')
xnew.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,100000,50000,60000.0,New York
1,50000,70000,50000.0,California
2,150000,100000,,Florida
3,130000,70000,10000.0,
4,70000,30000,30000.0,New York


In [43]:
pre

In [44]:
xnew_pre = pre.transform(xnew)
xnew_pre


Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,0.578298,-2.572256,-1.24751,0.0,0.0,1.0
1,-0.522032,-1.851177,-1.330113,1.0,0.0,0.0
2,1.678628,-0.769559,0.0,0.0,1.0,0.0
3,1.238496,-1.851177,-1.660524,1.0,0.0,0.0
4,-0.0819,-3.293335,-1.495318,0.0,0.0,1.0


In [46]:
preds = best_dtr.predict(xnew_pre)
preds

array([129917.04,  99937.59, 182901.99, 156122.51, 110352.25])

In [47]:
xnew['PROFIT_pred'] = preds

In [48]:
xnew

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT_pred
0,100000,50000,60000.0,New York,129917.04
1,50000,70000,50000.0,California,99937.59
2,150000,100000,,Florida,182901.99
3,130000,70000,10000.0,,156122.51
4,70000,30000,30000.0,New York,110352.25


In [49]:
xnew.to_csv("Results.csv", index=False)