In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [87]:
def knn_function(X_train, y_train, X_test, k):
    prediction = []
    for i in range(len(X_test)):
        distance = np.linalg.norm(X_train - X_test[i], axis=1)
        nearest_neighbor_id = distance.argsort()[:k]
        nearest_neighbor_value = y_train[nearest_neighbor_id]
        prediction.append(nearest_neighbor_value.mean())   
        
    return np.array(prediction)

In [88]:
df = pd.read_csv('50_Startups.csv')


In [89]:
def my_train_test_split(X, y, test_size=0.2, random_state=None):
    X = np.array(X)
    y = np.array(y)
    
    if X.shape[0] != y.shape[0]:
        raise ValueError("X va Y khong cung chieu du lieu")
        
    n_samples = X.shape[0]
    
    n_train = int(n_samples - (n_samples * test_size))
    
    if random_state is not None:
        np.random.seed(random_state)
    
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    
    train_indices = indices[:n_train]
    test_indices = indices[n_train:]
    
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    return X_train, X_test, y_train, y_test

In [90]:
print(df.isna().sum())

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [91]:
df_linear = df.drop(['State'], axis=1)

columns = df_linear.columns
print(columns)

for column in columns:
    df.loc[df_linear[column] == 0, column] = df_linear[column].sum() / df.shape[0]
    
X = df_linear.drop(['Profit'], axis=1)
y = df_linear['Profit']

print(X.info)
print(y.info)

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit'], dtype='object')
<bound method DataFrame.info of     R&D Spend  Administration  Marketing Spend
0   165349.20       136897.80        471784.10
1   162597.70       151377.59        443898.53
2   153441.51       101145.55        407934.54
3   144372.41       118671.85        383199.62
4   142107.34        91391.77        366168.42
5   131876.90        99814.71        362861.36
6   134615.46       147198.87        127716.82
7   130298.13       145530.06        323876.68
8   120542.52       148718.95        311613.29
9   123334.88       108679.17        304981.62
10  101913.08       110594.11        229160.95
11  100671.96        91790.61        249744.55
12   93863.75       127320.38        249839.44
13   91992.39       135495.07        252664.93
14  119943.24       156547.42        256512.92
15  114523.61       122616.84        261776.23
16   78013.11       121597.55        264346.06
17   94657.16       145077.58        2

In [92]:
X_train, X_test, y_train, y_test = my_train_test_split(X, y, 0.2, 42)

y_pred = knn_function(X_train, y_train, X_test, 5)

print (y_pred.T)
print (y_test.T)
print (f'RMSE ( Sklearn ): {np.sqrt(np.mean((y_test - y_pred ) ** 2))}')

[135412.448 115483.724 128154.858  67974.592 124434.054 139129.248
  90600.176 131634.642 102312.734  84694.63 ]
[146121.95 110352.25 124266.9   14681.4  118474.03 155752.6   71498.49
 132602.65 103282.38  81229.06]
RMSE ( Sklearn ): 19201.368554289733


In [93]:
import numpy as np
from sklearn . neighbors import KNeighborsRegressor

# Scikit - learn model
sk_model = KNeighborsRegressor (n_neighbors =5)
sk_model.fit (X_train , y_train )
sk_y_pred = sk_model.predict (X_test)

print (f'RMSE ( Sklearn ): {np.sqrt(np.mean((y_test - sk_y_pred ) ** 2))}')

RMSE ( Sklearn ): 19201.368554289733


In [97]:
def knn_class_function(X_train, y_train, X_test, k):
    prediction = []
    for i in range(len(X_test)):
        distance = np.linalg.norm(X_train - X_test[i], axis=1)
        nearest_neighbor_id = distance.argsort()[:k]
        nearest_neighbor_value = y_train[nearest_neighbor_id]
        
        counter = {}
        for neighbor in nearest_neighbor_value:
            counter[neighbor] = counter.get(neighbor, 0) + 1
        
        print(nearest_neighbor_value)
        prediction.append(max(counter, key=counter.get))   
        
    return np.array(prediction)

In [98]:
df['State'] = df['State'].replace({'Florida': 2, 'New York': 1, 'California': 0})

X = df.drop(['State'], axis=1)
y = df['State']

X_train, X_test, y_train, y_test = my_train_test_split(X, y, 0.2, 42)

y_pred = knn_class_function(X_train, y_train, X_test, 3)

print (y_pred.T)
print (y_test.T)

KeyError: 'State'

In [103]:
df = pd.read_csv('Iris.csv')

df['Species'] = df['Species'].replace({'Iris-setosa' : 0, 
                                       'Iris-versicolor' : 1, 
                                       'Iris-virginica' : 2})

X = df.drop(['Species'], axis=1)
y = df['Species']

X_train, X_test, y_train, y_test = my_train_test_split(X, y, 0.2, 42)

y_pred = knn_class_function(X_train, y_train, X_test, 5)

print (y_pred.T)
print (y_test.T)

[1 1 1 1 1]
[0 0 0 0 0]
[1 1 1 1 1]
[1 1 1 1 1]
[0 0 0 0 0]
[1 1 1 1 1]
[2 2 2 2 2]
[2 2 2 2 2]
[0 0 0 0 0]
[1 1 1 1 1]
[2 2 2 2 2]
[2 2 2 2 2]
[0 0 0 0 0]
[2 2 2 2 2]
[0 0 0 0 0]
[1 1 1 1 1]
[2 2 2 2 2]
[2 2 2 2 2]
[1 1 2 2 1]
[2 2 2 2 2]
[1 1 1 1 1]
[1 1 1 1 1]
[2 2 2 2 2]
[2 2 2 2 2]
[0 0 0 0 0]
[1 1 1 1 1]
[2 2 2 2 2]
[0 0 0 0 0]
[1 1 1 1 1]
[2 2 2 2 1]
[1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 2 0 1 2]
[1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 2 0 1 2]
