# 50 Startups dataset - K-Neighbors Regressor Example

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the 50 Startups dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = '50_Startups.csv'
data = pd.read_csv(csv_path)

In [3]:
# Print the frame's schema (column names, non-null counts, dtypes) as a first sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


# Data Preprocessing

In [4]:
# Number of missing values in each column (isnull is the pandas alias of isna).
data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
# Impute missing 'R&D Spend' values with the column mean rounded to a whole number.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on data['R&D Spend']: that form acts on an intermediate
# Series, emits a chained-assignment FutureWarning on pandas 2.2, and leaves `data`
# untouched when Copy-on-Write is active (the default from pandas 3.0).
data['R&D Spend'] = data['R&D Spend'].fillna(round(data['R&D Spend'].mean()))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
# Impute missing 'Marketing Spend' values with the column mean rounded to a whole number.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on data['Marketing Spend']: that form acts on an
# intermediate Series, emits a chained-assignment FutureWarning on pandas 2.2, and
# leaves `data` untouched when Copy-on-Write is active (the default from pandas 3.0).
data['Marketing Spend'] = data['Marketing Spend'].fillna(round(data['Marketing Spend'].mean()))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [7]:
# Impute missing 'State' values with the most frequent category (first mode).
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on data['State']: that form acts on an intermediate
# Series, emits a chained-assignment FutureWarning on pandas 2.2, and leaves `data`
# untouched when Copy-on-Write is active (the default from pandas 3.0).
data['State'] = data['State'].fillna(data['State'].mode()[0])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [19]:
# Categorical Data Handling

# Using the scikit-learn package

from sklearn.preprocessing import OneHotEncoder

# Ask the encoder for a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so try
# the current spelling first and fall back only for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# One-hot encode 'State'; the encoder needs 2-D input, hence the (n_rows, 1) reshape.
fState = ohe.fit_transform(np.array(data['State']).reshape(-1, 1))

# Feature matrix: one-hot State columns first, then columns 0-2 of `data`
# (R&D Spend, Administration, Marketing Spend).
features = np.concatenate([fState, np.array(data.iloc[:, [0, 1, 2]])], axis=1)
#print(features)

In [20]:
# Target: the 'Profit' column (position 4), kept 2-D with shape (n_rows, 1).
label = data[['Profit']].to_numpy()

# Implement K-Neighbors Regressor

In [21]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# Try random seeds 1..400 for a 90/10 train/test split and report every seed whose
# 5-neighbour regressor scores higher on the test split than on the training split
# and also clears the CL threshold on the test split.
# NOTE(review): with 50 rows each test split holds only 5 rows, and seeds are being
# selected by their test score, so the reported scores are optimistically biased —
# consider cross-validation for an honest estimate.
SL = 0.1
CL = 1 - SL
for i in range(1, 401):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.1, random_state=i
    )
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Only seeds that generalise (test beats train) and clear the threshold are reported.
    if not (test_score > train_score and test_score > CL):
        continue
    print(f"Test Score {test_score}, Train Score {train_score}, RandomSeed {i}")

Test Score 0.955833249359642, Train Score 0.8455978942730887, RandomSeed 1
Test Score 0.9152432725356717, Train Score 0.8450600157501783, RandomSeed 2
Test Score 0.9205222077814073, Train Score 0.844809521066878, RandomSeed 3
Test Score 0.9503990063815226, Train Score 0.8458668403613925, RandomSeed 9
Test Score 0.9350659791404976, Train Score 0.8431596273167298, RandomSeed 11
Test Score 0.9080917614915126, Train Score 0.8583950730467481, RandomSeed 16
Test Score 0.9026020881073196, Train Score 0.8512130981341051, RandomSeed 21
Test Score 0.9712845881274329, Train Score 0.851978128828738, RandomSeed 22
Test Score 0.9710647177256582, Train Score 0.8411316909069131, RandomSeed 26
Test Score 0.9226471818560702, Train Score 0.8623642100459619, RandomSeed 33
Test Score 0.9366615187756205, Train Score 0.8494501786168597, RandomSeed 38
Test Score 0.9426072706043019, Train Score 0.8452984950399025, RandomSeed 39
Test Score 0.906208322578222, Train Score 0.7990892129965338, RandomSeed 48
Test Sc

In [18]:
# Final model: refit the 5-neighbour regressor on the split given by random_state=60
# and report the score on both partitions.
# NOTE(review): the test partition is 10% of 50 rows (5 samples) and the seed appears
# to be hand-picked, so treat the test score as optimistic — confirm with cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.1, random_state=60
)
final_model = KNeighborsRegressor(n_neighbors=5)
final_model.fit(X_train, y_train)
train_r2 = final_model.score(X_train, y_train)
test_r2 = final_model.score(X_test, y_test)
print("Train score:", train_r2)
print("Test score:", test_r2)

Train score: 0.8552259199416467
Test score: 0.9932009504666949
