## Collaborative Filtering Models for AirBnB

#### LIBRARIES

In [7]:
import pandas as pd
import numpy as np

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession


#### Reading a file

In [3]:
#I have used music track dataset.
df=pd.read_csv('shared_final_prediction_file.csv', sep=',')# read csv into df
df = df.drop('rowid', axis = 1)

In [4]:
df.head()

Unnamed: 0,listing_id,reviewer_id,polarity
0,2539,25160947,0.6249
1,2539,91513326,0.9228
2,2539,90022459,0.4754
3,2539,116165195,0.9231
4,2539,1806142,0.9881


In [5]:
sc = SparkContext()

In [35]:
sqlContext = SQLContext(sc)


spark = SparkSession.builder \
.master('local') \
.appName('Data cleaning') \
.getOrCreate()

df = spark.read.format('csv').option('header','true').option('mode','DROPMALFORMED').load('shared_final_prediction_file.csv')

changedTypedf = df.withColumn("listing_id", df["listing_id"].cast("long"))
changedTypedf2 = changedTypedf.withColumn("reviewer_id", changedTypedf["reviewer_id"].cast("long"))
changedTypedf3 = changedTypedf2.withColumn("polarity", changedTypedf2["polarity"].cast("float"))

splitDF1, splitDF2 = changedTypedf3.randomSplit([0.7, 0.3])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=10,  regParam=0.3, rank=10,nonnegative=False, userCol="reviewer_id", itemCol="listing_id", ratingCol="polarity")
#als = ALS(maxIter=10,  regParam=0.15, rank=5,  userCol="user_id", itemCol="book_id", ratingCol="rating")
model = als.fit(splitDF1)



In [11]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(splitDF2)


In [13]:
changedPreddf = predictions.withColumn("polarity", predictions["polarity"].cast("double"))
changedPreddf2 = changedPreddf.withColumn("prediction", changedPreddf["prediction"].cast("double"))
changedPreddf2 = changedPreddf2.dropna()

In [17]:
pandas_final_df = changedPreddf2.toPandas()

In [22]:
pandas_final_df.head()

Unnamed: 0,rowid,listing_id,reviewer_id,polarity,prediction
0,1485,5803,4213410,0.969,0.036598
1,1421,5803,281328,0.9712,-0.068157
2,1435,5803,2727672,0.9805,0.101855
3,1480,5803,33059289,0.101,-0.030405
4,1497,5803,56281844,0.9533,0.032358


In [24]:
pandas_final_df = pandas_final_df.drop('rowid', axis = 1)

In [29]:
pandas_final_df.head()

Unnamed: 0,listing_id,reviewer_id,polarity,prediction
0,5803,4213410,0.969,0.036598
1,5803,281328,0.9712,-0.068157
2,5803,2727672,0.9805,0.101855
3,5803,33059289,0.101,-0.030405
4,5803,56281844,0.9533,0.032358


In [26]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="polarity",
                                predictionCol="prediction")
rmse = evaluator.evaluate(changedPreddf2)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8535667149338011


Now we will pull top three recommendation for each user;

In [33]:
from collections import defaultdict
 
def get_top10_recommendations(predictions, topN = 10):
     
    top_recs = defaultdict(list)
    print (top_recs)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [34]:
top10_recommendations = get_top10_recommendations(pandas_final_df)

defaultdict(<class 'list'>, {})


ValueError: too many values to unpack (expected 5)

In [30]:
pandas_final_df.head()

Unnamed: 0,listing_id,reviewer_id,polarity,prediction
0,5803,4213410,0.969,0.036598
1,5803,281328,0.9712,-0.068157
2,5803,2727672,0.9805,0.101855
3,5803,33059289,0.101,-0.030405
4,5803,56281844,0.9533,0.032358


In [None]:
# For each user we print 10 recommendation
i=0;
for uid, user_ratings in top3_recommendations.items():
    print(uid, [iid for (iid, _) in user_ratings])
    i=i+1;
    if(i==10):
        break;

### Tips

1.Surprise dataset function just takes three columns,user-item and ratings so be careful.

2.Building Antitest set gives you all the unknown user-item ratings,you may not require all of them.

3.Explore more and have fun!
