# Recommended Imports

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# Loading Files

Set the paths below to the locations of the train/test CSV files you downloaded from Kaggle.

In [13]:
# Read both Kaggle CSVs in one pass; edit the two file names below if your
# copies live somewhere else.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Sanity check: list the dtype pandas inferred for every training column.
print(train_df.dtypes)

seatid                    int64
price                     int64
distance                  int64
height                    int64
seating_angle             int64
visibility                int64
turns_visible             int64
track_length_visible      int64
range_visible             int64
seat_type                object
sun_cover               float64
rain_cover              float64
speed_category           object
overtake_probability    float64
braking_zone              int64
sold_first_hour           int64
dtype: object


# Preprocessing
Here, you should do any processing required to run KNN (or decision trees).

- Get any categorical values (`seat_type`, `speed_category`) and use Scikit's `LabelEncoder`
- For numerical values (except for id/target), normalize them using Scikit's `StandardScaler`

In [14]:
# Column roles used throughout the notebook.
# NOTE: `id` shadows Python's builtin id(); the name is kept because the
# prediction cell further down reads it.
id = "seatid" #set to the column representing id
target = "sold_first_hour" #set to the target column
categorical = ["seat_type", "speed_category"]  #list of non-numerical columns

#list of all columns that arent categorical, or the id/target
numerical = [col for col in train_df.columns if col not in categorical + [id, target]]

#use label encoder on each categorical column
for col in categorical:
    le = LabelEncoder()
    # Fit on the training column only, then reuse the same mapping for test so
    # both frames encode each category with the same integer.
    train_df[col] = le.fit_transform(train_df[col])
    # transform() raises ValueError if test contains a category not seen in train.
    test_df[col] = le.transform(test_df[col])

#use standard scaling on numerical values here
# The scaler's statistics come from the training data only; the test data is
# transformed with those same statistics.
scaler = StandardScaler()
train_df[numerical] = scaler.fit_transform(train_df[numerical])
test_df[numerical] = scaler.transform(test_df[numerical])

#define dataframes for model training here (X_train, y_train, X_test)
X_train = train_df.drop([id, target], axis=1)
y_train = train_df[target]
# Select exactly the training feature columns (same set, same order) rather
# than dropping only the id. The prediction cell adds a `target` column to
# test_df, so dropping just the id would leave that extra column in X_test
# whenever this cell is re-run after predicting.
X_test = test_df[X_train.columns]

# KNN
Run KNN here and write the predictions to submission.csv

In [15]:
#fit and predict knn here, creating a target column in test_df
# KNeighborsClassifier is used with its default settings.
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
# Store the predicted labels on test_df under the target column name.
test_df[target] = knn.predict(X_test)

#export the test_df with columns [id, target] to "submission.csv"
submission = test_df[[id, target]]
# Write to "submission.csv" as the instructions above state; index=False keeps
# the DataFrame's row index out of the file.
submission.to_csv("submission.csv", index=False)