## Linear regression

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split, then predict on both splits
# so that train and test performance can be compared afterwards.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the scikit-learn version this notebook is pinned to.
lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, y_train)

y_train_preds = lm_model.predict(X_train)
y_test_preds = lm_model.predict(X_test)

In [None]:
# R^2 of the fitted model on each split, reported as a percentage.
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print(f'Performance on train: {r2_train*100:.2f}%')
print(f'Performance on test:  {r2_test*100:.2f}%')


In [None]:
## Train the model, get the coefficients and select the 5 most important variables

## Eliminate features

In [None]:
# Frequency of each distinct value in the `bed_type` column.
listings['bed_type'].value_counts()

In [84]:
# Columns handed to the regression: the candidate predictors followed by the
# target. `neighbourhood_cleansed` is kept here as a comment but is currently
# not part of the selection.
regression_cols = [
    # 'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_category',
    'cancellation_policy',
    'review_scores_rating',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'number_of_reviews',
    'calendar_updated_clean',
    'price',  # y_value (regression target)
]

In [None]:
# Report which features recursive feature elimination (RFE) keeps.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE

# Build the feature matrix and target from the regression columns only.
X, y = clean_data(listings[regression_cols])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# RFE wrapped around a linear regression, keeping 10 features.
rfe = RFE(
    estimator=LinearRegression(normalize=True),
    n_features_to_select=10,
)
rfe.fit(X_train, y_train)

# One row per input column: whether RFE kept it and its elimination rank.
rfe_summary = dict(
    columns=X.columns.tolist(),
    selected=rfe.support_,
    rank=rfe.ranking_,
)
selected = pd.DataFrame(data=rfe_summary).sort_values(by='rank')
selected.head(10)


In [None]:
# Predict the target for the held-out split with the fitted RFE object.
y_pred = rfe.predict(X_test)

In [None]:
# Mean squared error of the RFE predictions on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error of the RFE predictions on the held-out split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Histogram of `performance.y_true` with a vertical reference line at x=60.
# NOTE(review): `performance` is not defined in this part of the notebook, and
# the meaning of the value 60 is not documented -- confirm both.
fig, ax = plt.subplots()
performance.y_true.hist(bins=40, ax=ax)
ax.axvline(x=60)

In [None]:
# Mean absolute percentage error of the RFE predictions on the held-out split.
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

In [None]:
# List the names of the available scorers.
# NOTE(review): the import of `SCORERS` is not visible in this section; if it is
# `sklearn.metrics.SCORERS`, it was removed in scikit-learn 1.3 in favour of
# `sklearn.metrics.get_scorer_names()` -- confirm against the pinned version.
SCORERS.keys()

In [None]:
# Sweep the number of features kept by RFE and cross-validate each setting.
#define dataset
X, y = clean_data(listings[regression_cols])

# R^2 is a "higher is better" score, so start below any reachable value and
# keep the maximum. (Starting at 100000 and keeping the smaller value on every
# iteration ended up recording the worst setting instead of the best one.)
best_model = -np.inf
best_n_features = None

# The CV scheme does not depend on the feature count, so build it once. With an
# integer random_state every call to split() produces the same folds, which
# means all feature counts are scored on identical splits.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

for n_features in range(1,21):
    # RFE selects `n_features` columns, then a fresh linear model is fitted on them.
    rfe = RFE(estimator=LinearRegression(normalize=True), n_features_to_select=n_features)
    model = LinearRegression(normalize=True)

    pipeline = Pipeline(steps=[('s',rfe),('m',model)])

    # Run cv; error_score='raise' surfaces fitting errors instead of hiding them as NaN.
    n_scores = cross_val_score(pipeline, X, y, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')

    mean_score = np.mean(n_scores)
    std_score = np.std(n_scores)

    print(f'Features: {n_features}, Mean score {mean_score}, std: {std_score}')

    # Remember the best mean score seen so far and the feature count that produced it.
    if mean_score > best_model:
        best_model = mean_score
        best_n_features = n_features

print(f'Best mean R^2: {best_model} with {best_n_features} features')


In [None]:
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

## Try a k-NN regressor for fun

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def get_clean_split(df, columns=[]):
    """Clean ``df`` and return an 80/20 train/test split of the result.

    The frame is passed through ``clean_data`` to obtain the feature matrix
    and the target. When ``columns`` is non-empty, the feature matrix is
    narrowed to those columns before splitting.

    Parameters
    ----------
    df
        Frame handed unchanged to ``clean_data``.
    columns
        Feature column names to keep. An empty sequence (the default) keeps
        every column returned by ``clean_data``. The default list is only
        read, never mutated.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    features, target = clean_data(df)

    restrict_columns = len(columns) > 0
    if restrict_columns:
        features = features[columns]

    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    return train_X, test_X, train_y, test_y

In [None]:
# Re-split the listings using only the `bedrooms` and `bathrooms` features.
X_train, X_test, y_train, y_test = get_clean_split(listings, columns=['bedrooms','bathrooms'])

In [None]:
# k-nearest-neighbours regressor using 5 neighbours.
knn = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit the k-NN regressor on the training split.
knn.fit(X_train, y_train)

In [None]:
# Score the fitted regressor on the held-out split (for scikit-learn regressors `score` is R^2).
knn.score(X_test, y_test)

In [None]:
def run_knn_regression(df, columns=[], neighbours=[5]):
    """Fit one k-NN regressor per neighbour count and collect the test scores.

    The data is cleaned and split once via ``get_clean_split``; every model is
    then trained and scored on that same split.

    Parameters
    ----------
    df
        Frame forwarded to ``get_clean_split``.
    columns
        Feature columns forwarded to ``get_clean_split``; an empty sequence
        (the default) keeps all of them.
    neighbours
        Iterable of ``n_neighbors`` values to try. Defaults to ``[5]``.

    Returns
    -------
    dict
        Maps each neighbour count to the value of ``knn.score`` on the
        held-out split.
    """
    train_X, test_X, train_y, test_y = get_clean_split(df, columns)

    def _fit_and_score(neighbour):
        # Train a fresh model for this neighbour count and score it on the test split.
        print(f'Training model with {neighbour} neighours')
        estimator = KNeighborsRegressor(n_neighbors=neighbour)
        estimator.fit(train_X, train_y)
        return estimator.score(test_X, test_y)

    return {neighbour: _fit_and_score(neighbour) for neighbour in neighbours}

In [None]:
# For each "top N features" cut-off, score k-NN over a grid of neighbour counts.
# NOTE(review): `feature_ranks` is not defined in this part of the notebook --
# presumably a frame with a `feature` column ordered by importance; confirm.
knn_feats = {}
for top_feats in [3, 5, 7, 9]:
    knn_feats[top_feats] = run_knn_regression(
        listings,
        columns=feature_ranks.head(top_feats).feature.tolist(),
        neighbours=[1, 3, 5, 7, 9, 15],
    )

# Feature counts become the columns, neighbour counts the index.
pd.DataFrame(knn_feats)

## Looking at the Review data set
- `date` is stored as a string; convert it to a proper datetime type
- only a few values are missing for the comments; as the comments are the only informative field, rows without a comment can be dropped

There does not seem to be a lot of information in the reviews. The value of the review dataset is captured in the comments, which would need to be unlocked with NLP. For this project that is out of scope, and with that the dataset won't be included in the blog post.

In [None]:
# Load the raw reviews table from disk and pass it to the summary helper.
# NOTE(review): `df_summary_overview` is defined outside this section --
# presumably it reports dtypes / missing values per column; confirm.
reviews = pd.read_csv('data/airbnb_seatle/reviews.csv')
df_summary_overview(reviews)

In [None]:
# Check empty comments:
# Check empty comments: show every review whose `comments` value is missing.
display(reviews[reviews.comments.isnull()])

In [None]:
# Reviews without a comment add no information and might be invalid duplicates.
# For each of them, show every review with the same listing / reviewer pair.
for listing_id, reviewer_id in reviews.loc[
    reviews.comments.isnull(), ['listing_id', 'reviewer_id']
].itertuples(index=False):
    display(
        reviews[
            (reviews.listing_id == listing_id)
            & (reviews.reviewer_id == reviewer_id)
        ]
    )

# Two of these reviews have other entries that do have a comment, so it is safe
# to drop the rows whose comment is missing.
reviews = reviews.dropna(subset=['comments'])