### Polynomial regression - accuracy 78063.718

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression

# Read the labelled training table and the unlabelled test table, in that order
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

# Clean 'engine' column to extract numeric horsepower values
def extract_hp(engine_str):
    """Return the number that precedes the first 'HP' in an engine description.

    Args:
        engine_str: Raw cell value from the 'engine' column. Strings such as
            ``'300HP ...'`` are parsed; non-string values (for example the
            float NaN pandas uses for a missing cell, or ``None``) are
            accepted and treated as missing.

    Returns:
        The text before the first 'HP' converted to ``float``, or ``np.nan``
        when the value is not a string or that text is not a valid number
        (which includes strings that contain no 'HP' marker at all and are
        not themselves numeric).
    """
    try:
        return float(engine_str.split('HP')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the cell is not a str (no usable .split);
        # ValueError: the prefix is not numeric. Anything else (e.g.
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return np.nan

# Replace the raw engine description with its numeric horsepower (NaN when
# no horsepower figure can be parsed).
train['engine'] = train['engine'].apply(extract_hp)
test['engine'] = test['engine'].apply(extract_hp)

# Impute missing horsepower with the TRAINING mean in both frames, so the test
# rows receive exactly the same preprocessing the model is fitted with rather
# than a statistic derived from the test set itself.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify the parent frame under Copy-on-Write.
engine_mean = train['engine'].mean()
train['engine'] = train['engine'].fillna(engine_mean)
test['engine'] = test['engine'].fillna(engine_mean)

# Target is the 'price' column; every other training column is a feature.
y_train = train['price']

# One-hot encode the non-numeric columns of both frames.
x_train = pd.get_dummies(train.drop(columns=['price']))
test = pd.get_dummies(test)

# Make the test frame carry exactly the training columns: columns seen only in
# the test data are dropped, columns missing from it are added and set to 0.
x_train, test = x_train.align(test, join='left', axis=1, fill_value=0)

# Any NaN still present in either frame becomes 0.
x_train = x_train.fillna(0)
test = test.fillna(0)

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
x_train_split, x_val, y_train_split, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Polynomial degrees that will be compared
degrees = [2, 3]

# Keep the 20 features with the highest univariate f_regression score; the
# selector is fitted on the training split only and then applied everywhere.
selector = SelectKBest(score_func=f_regression, k=20)
selector.fit(x_train_split, y_train_split)
x_train_selected, x_val_selected, test_selected = (
    selector.transform(frame) for frame in (x_train_split, x_val, test)
)

# Fit one polynomial regression per candidate degree and record its R-squared
# and MSE on the training split and on the validation split.
# NOTE: the "*_test_*" lists hold VALIDATION-split metrics; the names are kept
# because the reporting code further down reads them.
polynomial_r2_train_scores = []
polynomial_r2_test_scores = []
polynomial_MSE_train_scores = []
polynomial_MSE_test_scores = []

for degree in degrees:
    # Expand the selected features into all polynomial terms up to `degree`.
    # The expansion is fitted on the training split and reused for validation.
    poly_features = PolynomialFeatures(degree=degree)
    X_train_poly = poly_features.fit_transform(x_train_selected)
    X_val_poly = poly_features.transform(x_val_selected)
    # The test set is intentionally NOT expanded here: nothing in this loop
    # uses it, and the final prediction step builds its own expansion for the
    # chosen degree, so doing it per degree was wasted time and memory.

    model = LinearRegression()
    model.fit(X_train_poly, y_train_split)

    y_train_pred = model.predict(X_train_poly)
    y_val_pred = model.predict(X_val_poly)

    polynomial_r2_train = r2_score(y_train_split, y_train_pred)
    polynomial_r2_test = r2_score(y_val, y_val_pred)

    polynomial_r2_train_scores.append(polynomial_r2_train)
    polynomial_r2_test_scores.append(polynomial_r2_test)

    polynomial_MSE_train = mean_squared_error(y_train_split, y_train_pred)
    polynomial_MSE_test = mean_squared_error(y_val, y_val_pred)

    polynomial_MSE_train_scores.append(polynomial_MSE_train)
    polynomial_MSE_test_scores.append(polynomial_MSE_test)

# The winning degree is the one with the highest validation R-squared
# (np.argmax returns the first position when several scores tie).
best_index = np.argmax(polynomial_r2_test_scores)
best_degree = degrees[best_index]

print("\nPolynomial Linear Regression:")
print(f"Best Degree: {best_degree}")
# Per-degree metrics, listed in the same order as `degrees`
for label, scores in (
    ("Training R-squared Scores", polynomial_r2_train_scores),
    ("Validation R-squared Scores", polynomial_r2_test_scores),
    ("Training MSE Scores", polynomial_MSE_train_scores),
    ("Validation MSE Scores", polynomial_MSE_test_scores),
):
    print(f"{label}: {scores}")

# Refit on ALL training rows with the winning degree, then score the test set.
# The feature selector fitted earlier is reused unchanged for both frames.
full_train_selected = selector.transform(x_train)
full_test_selected = selector.transform(test)

best_poly_features = PolynomialFeatures(degree=best_degree)
X_train_poly = best_poly_features.fit_transform(full_train_selected)
test_poly = best_poly_features.transform(full_test_selected)

# LinearRegression.fit returns the estimator itself, so it can be chained
best_model = LinearRegression().fit(X_train_poly, y_train)
test_predictions = best_model.predict(test_poly)

# Build the two-column submission table (test ids alongside predicted prices),
# write it to disk without the index, and show its last rows.
submission = test[['id']].assign(price=test_predictions)
submission.to_csv('submission_polynomial_best.csv', index=False)
print(submission.tail())


In [6]:

# Show the first rows of the submission frame as a quick sanity check
print(submission.head())


      id         price
0  54273  22451.752866
1  54274  25011.752866
2  54275  30067.752866
3  54276  59827.752866
4  54277  43443.752866
