In [1]:
import pandas as pd
import numpy as np
import pandas_profiling

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import random
import os
# Single seed shared by every source of randomness used in this notebook.
seed = 42
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# only reaches child processes started later; it does not change str hashing
# in the kernel that is already running.
os.environ["PYTHONHASHSEED"] = str(seed)
# Legacy global NumPy RNG.
np.random.seed(seed)
# Standard-library RNG.
random.seed(seed)

In [2]:
def _regression(x, y, result:list):
    """Benchmark a fixed set of regressors on one hold-out split.

    The data is split once with ``train_test_split(..., random_state=1)``.
    Each candidate model is fitted on the training part and scored on the
    held-out part with mean absolute error (lower is better).

    Parameters
    ----------
    x : feature matrix accepted by ``train_test_split`` and the estimators.
    y : regression targets aligned with ``x``.
    result : list
        Mutated in place: one ``{'model': <repr>, 'score': <MAE>}`` dict is
        appended per candidate, in the order the candidates are tried.

    Returns
    -------
    None. The scores are delivered through ``result``.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

    candidates = [
        KNeighborsRegressor(),
        KNeighborsRegressor(weights='distance'),
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
    ]
    for model in candidates:
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        # mean_absolute_error is declared as (y_true, y_pred); pass the ground
        # truth first so the call matches the documented signature.
        score = mean_absolute_error(y_test, predictions)
        result.append({'model': str(model), 'score': score})

In [3]:
# Load the labelled training rows and the unlabelled rows to predict.
data = pd.read_csv('../input/2022-ml-midterm-p4/train.csv')
test = pd.read_csv('../input/2022-ml-midterm-p4/test.csv')

# Fit one encoder on the union of 'model' labels from both files so that a
# label appearing only in the test file still gets a code.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
le = LabelEncoder()
models = pd.concat([data['model'], test['model']]).drop_duplicates()
le.fit(models)

data['model'] = le.transform(data['model'])
test['model'] = le.transform(test['model'])

# Separate features from the target; 'ID' is dropped from both frames.
# NOTE(review): assumes train and test share the same remaining feature
# columns in the same order, since the scaler below is applied to both.
x = data.drop(['ID', 'price'], axis=1)
y = data['price']
test = test.drop('ID', axis=1)

# Fit the scaler on the training features only, then reuse those statistics
# for the test features. From here on x and test are NumPy arrays.
sc = StandardScaler()
x = sc.fit_transform(x, y)
test = sc.transform(test)

In [4]:
# Collect one {'model', 'score'} record per candidate regressor, then print
# them as a table (score is hold-out MAE, lower is better).
result = []
_regression(x, y, result)
leaderboard = pd.DataFrame(result)
print(leaderboard)

                                     model        score
0                    KNeighborsRegressor()  1422.863935
1  KNeighborsRegressor(weights='distance')  1373.082817
2                       LinearRegression()  3416.653967
3                  DecisionTreeRegressor()  1531.536589
4                  RandomForestRegressor()  1210.256632


In [5]:
# Start from the sample submission file and show it before it is overwritten.
submit = pd.read_csv('../input/2022-ml-midterm-p4/sample.csv')
print(submit)

# Final model: distance-weighted 7-nearest-neighbours, trained on every
# labelled row (no hold-out here). fit() returns the estimator itself.
model = KNeighborsRegressor(n_neighbors=7, weights='distance').fit(x, y)

# Predict the scaled test features and write the predictions to disk.
# NOTE(review): assumes the rows of sample.csv are in the same order as
# test.csv — confirm.
res = model.predict(test)
submit['price'] = res
submit.to_csv('submission.csv', index=False)

          ID  price
0          0      1
1          1      1
2          2      1
3          3      1
4          4      1
...      ...    ...
16826  16826      1
16827  16827      1
16828  16828      1
16829  16829      1
16830  16830      1

[16831 rows x 2 columns]
