# 50 Startups dataset - K-Neighbors Regressor Example

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('50_Startups.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


# Data Preprocessing

In [4]:
data.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
data['R&D Spend'].fillna( round(data['R&D Spend'].mean()) , inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
data['Marketing Spend'].fillna( round(data['Marketing Spend'].mean()) , inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [7]:
data['State'].fillna( data['State'].mode()[0] , inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [98]:
# Categorical Data Handling

# Using Sci-kit Package

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)

fState = ohe.fit_transform( np.array(data['State']).reshape(-1,1) )


features = np.concatenate( [fState , np.array(data.iloc[:,[0,1,2]])] , axis = 1)
print(features)
print(data.head())

[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05
  4.7178410e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05
  4.4389853e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05
  4.0793454e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05
  3.8319962e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04
  3.6616842e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.3187690e+05 9.9814710e+04
  3.6286136e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.3461546e+05 1.4719887e+05
  1.2771682e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.3029813e+05 1.4553006e+05
  3.2387668e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.2054252e+05 1.4871895e+05
  3.1161329e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.2333488e+05 1.0867917e+05
  3.0498162e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.0191308e+05 1.1059411e+05
  2.2916095e+05]
 [1.0000000e+00 0.000

In [93]:
label = data.iloc[:,[4]].values.ravel()
#print(label)

# Implement K-Neighbors Regressor

In [10]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

SL = 0.1
CL = 1 - SL
for i in range(1,401):
    X_train,X_test,y_train,y_test = train_test_split(features,label,test_size=0.1,random_state=i)
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_train,y_train)
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score and test_score > CL:
        print("Test Score {}, Train Score {}, RandomSeed {}".format(test_score,train_score,i))

Test Score 0.955833249359642, Train Score 0.8455978942730887, RandomSeed 1
Test Score 0.9152432725356717, Train Score 0.8450600157501783, RandomSeed 2
Test Score 0.9205222077814073, Train Score 0.844809521066878, RandomSeed 3
Test Score 0.9503990063815226, Train Score 0.8458668403613925, RandomSeed 9
Test Score 0.9350659791404976, Train Score 0.8431596273167298, RandomSeed 11
Test Score 0.9080917614915126, Train Score 0.8583950730467481, RandomSeed 16
Test Score 0.9026020881073196, Train Score 0.8512130981341051, RandomSeed 21
Test Score 0.9712845881274329, Train Score 0.851978128828738, RandomSeed 22
Test Score 0.9710647177256582, Train Score 0.8411316909069131, RandomSeed 26
Test Score 0.9226471818560702, Train Score 0.8623642100459619, RandomSeed 33
Test Score 0.9366615187756205, Train Score 0.8494501786168597, RandomSeed 38
Test Score 0.9426072706043019, Train Score 0.8452984950399025, RandomSeed 39
Test Score 0.906208322578222, Train Score 0.7990892129965338, RandomSeed 48
Test Sc

In [11]:
#Final Model
X_train,X_test,y_train,y_test = train_test_split(features,label,test_size=0.1,random_state=60)
final_model = KNeighborsRegressor(n_neighbors=5)
final_model.fit(X_train,y_train)
print("Train score:", final_model.score(X_train,y_train))
print("Test score:", final_model.score(X_test,y_test))

Train score: 0.8552259199416467
Test score: 0.9932009504666949


# Implement Bagging Regressor with LinearRegression

In [12]:
# Linear Regression
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
algorithm = LinearRegression()

model = BaggingRegressor(n_estimators=100, #No of weak learners
                         base_estimator=algorithm) #The algo to be used for learning

model.fit(X_train,y_train)

BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100)

In [13]:
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.9502959082946706
0.9489715900713226


# Implement Bagging Regressor with KNeighborsRegressor

In [14]:
# K-Neighbors Regressor
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
algorithm = KNeighborsRegressor()

model = BaggingRegressor(n_estimators=100, #No of weak learners
                         base_estimator=algorithm) #The algo to be used for learning

model.fit(X_train,y_train)

BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100)

In [15]:
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.842463785895132
0.9731761304017041


# Decision Tree Regressor

In [16]:
from sklearn.tree import DecisionTreeRegressor
max_depth = 3
for i in range(1,401):
    X_train,X_test,y_train,y_test = train_test_split(features,label,test_size=0.2,random_state=i)
    model_tree = DecisionTreeRegressor(max_depth=max_depth)
    model_tree.fit(X_train, y_train)
    score_train = model_tree.score(X_train,y_train)
    score_test = model_tree.score(X_test,y_test)
    if score_test > score_train and score_test > 0.95:
        print("Train:", score_train, ", Test:", score_test, ", Seed:", i)

Train: 0.966298754312304 , Test: 0.9842310223890651 , Seed: 126
Train: 0.9656550439706006 , Test: 0.96595191432676 , Seed: 181


In [17]:
best_seed = 126
X_train,X_test,y_train,y_test = train_test_split(features,label,test_size=0.2,random_state=best_seed)
final_model_tree = DecisionTreeRegressor(max_depth=max_depth)
final_model_tree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3)

In [18]:
score_train = final_model_tree.score(X_train,y_train)
score_test = final_model_tree.score(X_test,y_test)
print("Train:", score_train, ", Test:", score_test, ", Seed:", best_seed)

Train: 0.966298754312304 , Test: 0.9842310223890651 , Seed: 126


# PCA

In [125]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features = sc.fit_transform(features)

In [126]:
from sklearn.decomposition import PCA
principalComponents = PCA(n_components=6) # 6 features in 50 startups dataset after OHE
principalComponents.fit(features,label)

PCA(n_components=6)

In [127]:
principalComponents.explained_variance_ratio_

array([3.26596913e-01, 2.51852089e-01, 2.16730846e-01, 1.68048197e-01,
       3.67719551e-02, 2.09922995e-32])

### Variance ratios are all low. Use n_components=1

In [128]:
principalComponentsFinal = PCA(n_components=1) #Here n_components = n_features
principalComponentsFinal.fit(features,label)

PCA(n_components=1)

In [129]:
principalComponentsFinal.explained_variance_ratio_

array([0.32659691])

In [130]:
finalFeatures = principalComponentsFinal.transform(features)

### KNeighborsRegressor?

In [131]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

for i in range(1,1201):
    X_train,X_test,y_train,y_test = train_test_split(finalFeatures,label,test_size=0.1,random_state=i)
    knr = KNeighborsRegressor(n_neighbors=7)
    #model = BaggingRegressor(n_estimators=5, #No of weak learners
                         #base_estimator=knr) #The algo to be used for learning
    knr.fit(X_train,y_train)
    train_score = knr.score(X_train,y_train)
    test_score = knr.score(X_test,y_test)
    
    if test_score > train_score and test_score > CL:
        print("Test Score {}, Train Score {}, RandomSeed {}".format(test_score,train_score,i))

Test Score 0.9337211099023445, Train Score 0.7065275688397826, RandomSeed 17
Test Score 0.9005106670057083, Train Score 0.7036866689631882, RandomSeed 60
Test Score 0.9311720105842937, Train Score 0.6746590804109218, RandomSeed 306
Test Score 0.9483313736316629, Train Score 0.712237335451444, RandomSeed 325
Test Score 0.9243949173640865, Train Score 0.7025830292083617, RandomSeed 328
Test Score 0.9234323863128409, Train Score 0.6701512137225252, RandomSeed 378
Test Score 0.9317378379528075, Train Score 0.6912087249252477, RandomSeed 525
Test Score 0.9029943478247453, Train Score 0.7062550934085838, RandomSeed 529
Test Score 0.9524718142323688, Train Score 0.710419298738937, RandomSeed 542
Test Score 0.9149938562724854, Train Score 0.6854207630567628, RandomSeed 618
Test Score 0.9308653686349264, Train Score 0.6844836408432617, RandomSeed 623
Test Score 0.9461468231485978, Train Score 0.682818083844491, RandomSeed 687
Test Score 0.9628465728204758, Train Score 0.7133061907637538, Random

### Decision Tree?

In [46]:
from sklearn.tree import DecisionTreeRegressor
max_depth = 5
for i in range(1,401):
    X_train,X_test,y_train,y_test = train_test_split(finalFeatures,label,test_size=0.15,random_state=i)
    model_tree = DecisionTreeRegressor(max_depth=max_depth)
    model_tree.fit(X_train, y_train)
    score_train = model_tree.score(X_train,y_train)
    score_test = model_tree.score(X_test,y_test)
    if score_test > score_train and score_test > 0.75:
        print("Train:", score_train, ", Test:", score_test, ", Seed:", i)

Train: 0.8573144168783773 , Test: 0.8776734871682511 , Seed: 108
Train: 0.8866096669764724 , Test: 0.9042648548547481 , Seed: 178
Train: 0.8667833468838886 , Test: 0.8860698677067947 , Seed: 180


### XGBRegressor?

In [None]:
from xgboost import XGBRegressor

for i in range(1,101):
    X_train,X_test,y_train,y_test = train_test_split(finalFeatures,
                                                    label,
                                                    test_size=0.2,
                                                    random_state=i)
    
    model = XGBRegressor(learning_rate=0.001)
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score and test_score > 0.95:
        print("Test : {} Train: {} RS: {}".format(test_score,train_score,i))

### Despite the low training score, KNeighborsRegressor yielded the best results


In [77]:
best_seed = 328
best_n_neighbors = 10
X_train,X_test,y_train,y_test = train_test_split(finalFeatures,label,test_size=0.1,random_state=best_seed)
knr = KNeighborsRegressor(n_neighbors=best_n_neighbors)
knr.fit(X_train,y_train)
train_score = knr.score(X_train,y_train)
test_score = knr.score(X_test,y_test)

In [78]:
print("Test : {} Train: {} RS: {}".format(test_score,train_score,best_seed))

Test : 0.9544029307223478 Train: 0.684829857698274 RS: 328


 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64

In [133]:
## Deployment

rndSpend = float(input("Enter R&D Spending: "))
adminSpend =float(input("Enter Administration Spending: "))
marketSpend = float(input("Enter Marketing Spending: "))
state = input("Enter State: ")
if state == "New York":
    fState = [0,0,1]
elif state == "Florida":
    fState = [0,1,0]
else:
    fState = [1,0,0]

#FeatureSet
featureSet = np.concatenate( [[fState] , np.array([[rndSpend,adminSpend,marketSpend]])], axis = 1)

#Standardization
featureStandardizedSet = sc.transform(featureSet)

#transform features using PCA
principalCFeatures = principalComponentsFinal.transform(featureStandardizedSet)

#Predict
print(knr.predict(principalCFeatures))

Enter R&D Spending:  50000
Enter Administration Spending:  50000
Enter Marketing Spending:  50000
Enter State:  New York


[163574.9]


In [134]:
#Pickle Code
import pickle
#Deploy StandardScaler Object
pickle.dump(sc,open("ScalerFor50Startups.scale",'wb'))
#Deploy PCA object
pickle.dump(principalComponentsFinal,open('PCA.pca','wb'))
#Deploy Model
pickle.dump(knn,open('Model50Startups.pkl','wb'))