# Kaggle Models

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

## Load the data

In [30]:
# Load the engineered training data
train = pd.read_csv('../datasets/clean/train_engineered.csv')

# Set features and target.
# select_dtypes is the public pandas API for picking columns by dtype;
# 'bool' is listed explicitly so boolean columns (if any) stay in the
# feature matrix alongside the int/float columns.
X = train.select_dtypes(include=['number', 'bool']).drop(columns='SalePrice')
y = train['SalePrice']

# Save the feature column names, in training order
features = list(X.columns)

---
**Linear Regression**

In [31]:
# Ordinary least squares baseline
model = LinearRegression()

# Average the per-fold scores returned by cross_val_score
# (default scorer and default fold count for this estimator)
cval_score = np.mean(cross_val_score(model, X, y))
print(f'Cross validation score: {round(cval_score, 3)}')

# Fit on the full feature matrix; as the last expression of the cell,
# the fitted estimator is also what the cell displays
model.fit(X, y)

Cross validation score: 0.919


LinearRegression()

Now that the models have been evaluated using a train/test split, I will fit the final model on the entire training dataset. This gives the model more data to learn from, which should improve the quality of the fit.

### Make predictions

In [34]:
# Load the engineered Kaggle test set
test = pd.read_csv('../datasets/clean/test_engineered.csv')

# Select exactly the columns the model was fitted on, in the same order.
# Indexing with `features` raises a KeyError if the test file is missing a
# training column, instead of silently predicting on misaligned columns.
X_test = test[features]
ids = test['Id']

# Package dataframe for submission.
# Building the frame from a dict names both columns explicitly, so nothing
# depends on pandas' auto-generated 'Unnamed 0' label and Id never
# round-trips through float.
preds = (
    pd.DataFrame({'Id': ids.astype(int), 'SalePrice': model.predict(X_test)})
    .sort_values(by='Id', ignore_index=True)
)

In [35]:
# Write the linear-regression predictions to disk, omitting the row index
preds.to_csv(
    '../datasets/submissions/mlr.csv',
    index=False,
)

---
**Ridge Regression**

In [36]:
# Standardize the training features: learn the per-column scaling
# parameters from X, then apply them to X itself
sc = StandardScaler().fit(X)
Z = sc.transform(X)

In [37]:
# Candidate ridge penalties: 100 log-spaced values from 10**0 to 10**5
r_alphas = np.logspace(start=0, stop=5, num=100)

# 5-fold cross-validated ridge, scoring each candidate alpha by R^2
ridge_cv = RidgeCV(
    alphas=r_alphas,
    scoring='r2',
    cv=5,
)

# Fit on the scaled training features (trailing ';' hides the estimator repr)
ridge_cv.fit(Z, y);

### Make predictions

In [38]:
# Scale the test features with the scaler already fitted on the training data
Z_test = sc.transform(X_test)

# Package dataframe for submission.
# Building the frame from a dict names both columns explicitly, so nothing
# depends on pandas' auto-generated 'Unnamed 0' label and Id never
# round-trips through float.
preds = (
    pd.DataFrame({'Id': ids.astype(int), 'SalePrice': ridge_cv.predict(Z_test)})
    .sort_values(by='Id', ignore_index=True)
)

In [39]:
# Write the ridge predictions to disk, omitting the row index
preds.to_csv(
    '../datasets/submissions/ridge.csv',
    index=False,
)

---
**Lasso Regression**

In [40]:
# Set up a list of Lasso alphas to check
l_alphas = np.logspace(-3, 0, 100)

# Cross-validate over our list of Lasso alphas.
# max_iter is raised above scikit-learn's default (1000): with the default
# budget the coordinate-descent solver repeatedly warned that it had not
# converged for this alpha grid.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=10000)

# Fit model using best lasso alpha
lasso_cv.fit(Z, y);

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

### Make predictions

In [41]:
# Package dataframe for submission.
# Building the frame from a dict names both columns explicitly, so nothing
# depends on pandas' auto-generated 'Unnamed 0' label and Id never
# round-trips through float.
preds = (
    pd.DataFrame({'Id': ids.astype(int), 'SalePrice': lasso_cv.predict(Z_test)})
    .sort_values(by='Id', ignore_index=True)
)

In [42]:
# Write the lasso predictions to disk, omitting the row index
preds.to_csv(
    '../datasets/submissions/lasso.csv',
    index=False,
)