In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb
import logging
import math
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm.sklearn import LGBMClassifier

In [0]:
! wget -q https://www.dropbox.com/s/lhb1awpi769bfdr/test.csv?dl=1 -O test.csv
! wget -q https://www.dropbox.com/s/gudb5eunj700s7j/train.csv?dl=1 -O train.csv

# Read in the Dataset

First, we're just going to read in the dataset. We'll separate the features from the target attribute, which in this dataset is **the price rating** of the individual listing. We'll output the distribution of labels in the training set to get an idea of how our data is distributed.

In [0]:
# Load the training and test tables from the downloaded CSV files
Xy_train = pd.read_csv("train.csv", engine="python")
X_test = pd.read_csv("test.csv", engine="python")

# Split the training table into the target column and the feature columns
y_train = Xy_train["price_rating"]
X_train = Xy_train.drop(columns=["price_rating"])

# Keep the test-set ids; they are needed for the submission file
testing_ids = X_test["Id"]

# Report how many rows each split contains
print("Number of training samples: {}".format(X_train.shape[0]))
print("Number of testing samples: {}".format(X_test.shape[0]))

# Histogram of how often each price_rating value occurs in the training set
Xy_train["price_rating"].hist()

# Understand Our Data

The next step is to get an intuitive idea of which attributes may be useful when building our model. We can view the statistical metrics for each numerical feature in our dataset in order to understand how they're distributed, along with getting a sense of which data might actually prove useful (i.e. if there are a lot of missing values, we may want to avoid a column).

For categorical data, we can't quite do this same operation. I'm going to select a subset of those features that I believe may have some predictive properties. This is a purely intuitive approach, but viewing the data gives us a clear sense of what we want to exclude (things like URLs, etc.)

In [0]:
# Print each column's dtype and non-null count (helps spot sparse columns)
X_train.info()
# Display summary statistics for the numerical columns (last expression -> cell output)
X_train.describe()

In [0]:
# Give us an idea of the non-numerical (mostly categorical) data we may want to use:
# keep only the columns whose dtype is `object` and display them as the cell output
X_train.select_dtypes(include=["object"])

# Clean the Data

The next step is to take the selected attributes and clean them up. For numerical data, this means we're going to replace the missing values (NaN) with the median of the column, then standardize the data (0 mean with standard deviation 1). 

For categorical data, we'll replace missing values with an indicative **"missing"** tag that our model will use. We'll also "one-hot encode" the data, which essentially splits all of the potential values for a category into their own columns, with 1 representing the value being present in a sample and 0 representing its absence. This is done to avoid assigning ordinal scores to categorical data (i.e. 1, 2, 3, 4, ...), as the model would otherwise learn that higher category codes carry more meaning.

In [0]:
# Fix NumPy's global random seed so repeated runs behave the same
np.random.seed(0)

# Numerical columns that are fed to the model
numeric_features = [
    "latitude",
    "longitude",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "square_feet",
    "guests_included",
    "number_of_reviews",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month",
    "availability_365",
    "availability_90",
    "availability_30",
    "availability_60",
    "number_of_reviews_ltm",
    "minimum_nights",
    "maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_minimum_nights",
]

# Impute and scale numerical data.
# The imputation statistic is chosen with `strategy`; `fill_value` only takes
# effect together with strategy="constant", so the median has to be requested
# through `strategy` or the imputer stays on its default ("mean").
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Select categorical features.
# Each column is listed exactly once: a repeated name would be selected and
# one-hot encoded twice, and would create duplicate column labels wherever
# this list is used to index a DataFrame.
categorical_features = ["host_is_superhost", "property_type",
                        "bed_type", "is_business_travel_ready",
                        "room_type", "cancellation_policy"]

# Categorical pipeline: fill gaps with the literal category "missing", then
# one-hot encode; handle_unknown="ignore" keeps transform from failing on
# categories that were not seen during fit
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply the relevant pipeline to each column type: the numeric pipeline to
# `numeric_features` and the categorical pipeline to `categorical_features`.
# Defined once only; a second identical definition would just shadow this one.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Classification Using LightGBM

The first model used to fit the data was LightGBM. With small-to-medium structured data, ensemble tree methods tend to perform quite well, so using LightGBM made logical sense. A parameter grid is used to exhaustively search the hyperparameter configurations.

In [0]:
# Classification pipeline: shared preprocessing followed by LightGBM.
# NOTE: the final step is named "regressor" even though it is a classifier;
# the hyperparameter grid keys rely on that step name.
regr = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            LGBMClassifier(
                boosting_type="gbdt",
                objective="multiclass",
                num_classes=3,
            ),
        ),
    ]
)


# Select features from sets to be used.
# The combined name list is de-duplicated (first occurrence wins, order kept)
# so that neither frame ends up with repeated column labels if a feature name
# appears more than once in the feature lists.
selected_features = list(dict.fromkeys([*numeric_features, *categorical_features]))
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Hyperparameter grid for the search.
# `__` walks into nested estimators: `regressor__n_estimators` is the
# `n_estimators` parameter of the pipeline step named `regressor`
# (the LightGBM classifier), and `preprocessor__num__imputer__strategy` is the
# `strategy` of the imputer inside the numeric branch of the preprocessor.
param_grid = dict(
    preprocessor__num__imputer__strategy=["median"],
    regressor__num_leaves=[30, 45],
    regressor__n_estimators=[200, 300],
    regressor__max_depth=[10, 15],
    regressor__learning_rate=[0.03, 0.05],
    regressor__min_data_in_leaf=[10, 20],
)

# Exhaustive search over `param_grid`, scored by accuracy with 3-fold
# cross-validation, run in a single process with verbose progress output
grid_search = GridSearchCV(
    estimator=regr,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=1,
    cv=3,
    verbose=3,
)

# Run the hyperparameter search and fit the model
grid_search.fit(X_train, y_train)

# Report the best mean cross-validated accuracy found by the search
print("best score {}".format(grid_search.best_score_))

In [0]:
# Predict the test set and write the predictions to the submission CSV
y_pred = grid_search.predict(X_test)
submission = pd.DataFrame({'Id': testing_ids, 'price_rating': y_pred})
submission.to_csv(r'./mainguy_submission_three.csv', index=False)

In [0]:
# mount and save to drive
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, which would explain why this import lives here rather than in the
# top import cell — confirm before moving it.
from google.colab import drive
# Mount Google Drive at the relative path `drive`
drive.mount('drive')

In [0]:
# Copy submission CSV to drive (needed when kaggle command not working)
# The destination is quoted because the folder name `My Drive` contains a space.
!cp mainguy_submission_three.csv "drive/My Drive/"