In [None]:
import pandas as pd
import numpy as np

In [None]:
# NOTE(review): WINDOW_SIZE is not referenced anywhere else in the visible notebook
# (the sequence-building cell uses its own `sequence_length`) — confirm it is still needed.
WINDOW_SIZE = 20
# City name used to filter the places table.
CITY = "Philadelphia"

In [None]:
from langdetect import detect

In [None]:
# Raw user table, read from the parent directory.
users = pd.read_csv("../users.csv")

In [None]:
# Load the per-city review dump. The file name is derived from CITY so the
# reviews always belong to the same city the places table is filtered on.
reviews = pd.read_csv(f"../{CITY}_reviews.csv")

# Timestamps are stored as "YYYY-MM-DD HH:MM:SS" strings; parse them once here
# so the later chronological sort operates on real datetimes.
reviews["date"] = pd.to_datetime(reviews["date"], format="%Y-%m-%d %H:%M:%S")

# Keep only the columns used downstream. The explicit copy makes this an
# independent frame, so later column assignments never target a view.
reviews = reviews[["user_id", "business_id", "stars", "text", "date"]].copy()

In [None]:
# Keep the original string id, then replace user_id with its integer category code.
reviews["user_id_str"] = reviews["user_id"]
reviews["user_id"] = reviews["user_id"].astype("category").cat.codes

In [None]:
# Inspect the reviews frame after user_id has been integer-encoded.
reviews

In [None]:
# Number of whitespace-separated tokens per review. The vectorised string
# accessor yields NaN for missing text instead of raising, and NaN rows are
# then discarded by the length filter that follows.
reviews["len"] = reviews["text"].str.split().str.len()

In [None]:
# Keep reviews with more than 10 and fewer than 256 words, then discard the
# helper word-count column.
reviews = reviews[reviews["len"].gt(10) & reviews["len"].lt(256)].drop(columns=["len"])

In [None]:
# Load the pretrained fastText model from ./pretrained/lid.176.bin; the next
# cells use its predictions to label each review's language.
import fasttext as ft
ft_model = ft.load_model("./pretrained/lid.176.bin")

In [None]:
# Top predicted label for every review. Newlines are replaced with spaces
# before the text is handed to the model.
reviews["language"] = [
    ft_model.predict(text.replace("\n", " "))[0][0] for text in reviews["text"]
]

In [None]:
# Keep only the reviews whose predicted label is English.
reviews = reviews[reviews["language"].eq("__label__en")]

In [None]:
# Inspect the reviews that remain after the language filter.
reviews

In [None]:
# Sanity check of the remaining language labels. value_counts() is used here
# because Counter is only imported further down the notebook, so the previous
# Counter(...) call raised NameError on a fresh top-to-bottom run.
reviews["language"].value_counts()

In [None]:
places = pd.read_csv("../places.csv")

# Restrict to the configured city and to the columns used downstream. The
# explicit copy makes this an independent frame, so the assignment below is
# guaranteed to modify `places` itself.
places = places.loc[places["city"] == CITY, ["business_id", "name", "categories"]].copy()

# Turn the comma-separated category string into a list of trimmed names.
# The result is assigned back instead of using an in-place fillna on a column
# selection, which is chained assignment and not guaranteed to update the
# frame. Empty fragments are skipped so a place without categories gets []
# rather than [""].
places["categories"] = (
    places["categories"]
    .fillna("")
    .apply(lambda raw: [part.strip() for part in raw.split(",") if part.strip()])
)

In [None]:
# Keep the original string id, then replace business_id with its integer category code.
places["business_id_str"] = places["business_id"]
places["business_id"] = places["business_id"].astype("category").cat.codes

In [None]:
# Inspect the places frame after business_id has been integer-encoded.
places

In [None]:
# Lookup table keyed by the original string business id.
indexed_places = places.set_index(keys="business_id_str", drop=True)

In [None]:
# Flatten every place's category list into one list (duplicates kept so the
# names can be counted). A single comprehension replaces repeated list
# concatenation inside iterrows(), which re-copied the growing list per row.
categories = [
    category
    for place_categories in places["categories"]
    for category in place_categories
]

In [None]:
from collections import Counter

# Names of the 25 most frequent categories, most frequent first.
category_counts = Counter(categories)
popular_categories = [name for name, _count in category_counts.most_common(25)]

In [None]:
# Show the selected category names.
popular_categories

In [None]:
# Add one 0/1 indicator column per popular category: 1 when the place lists
# that category, 0 otherwise.
for category in popular_categories:
    places[category] = [
        int(category in place_categories)
        for place_categories in places["categories"]
    ]

In [None]:
# Inspect the places frame with the category indicator columns added.
places

In [None]:
# Translate each review's string business id into the integer code assigned
# in `places`. One vectorised lookup replaces a per-row iterrows()/.loc loop
# that also printed every row index.
place_codes = reviews["business_id"].map(indexed_places["business_id"])

# As before, a review pointing at a business that is absent from `places`
# is an error; report which ids are missing instead of failing on the first.
unmatched = place_codes.isna()
if unmatched.any():
    missing_ids = reviews.loc[unmatched, "business_id"].unique()
    raise KeyError(
        f"{len(missing_ids)} business ids in reviews are missing from places, "
        f"e.g. {list(missing_ids[:5])}"
    )

encoded_place_ids = place_codes.astype("int64").tolist()
reviews['business_id'] = encoded_place_ids

In [None]:
# The language label is no longer needed once the reviews are filtered.
reviews = reviews.drop(columns=["language"])

In [None]:
# Inspect the reviews frame that is about to be written to disk.
reviews

In [None]:
# Persist the cleaned reviews as comma-separated values without the index column.
reviews.to_csv("data/reviews.csv", sep=",", index=False)

In [None]:
# Persist the encoded places as comma-separated values without the index column.
places.to_csv("data/places.csv", sep=",", index=False)

In [None]:
# Group each user's reviews in chronological order.
reviews_group = reviews.sort_values(by=["date"]).groupby("user_id")

# One row per user, each column holding that user's values as a list.
# The per-column Series are combined by index alignment on the group key
# (user_id), so the ids and the lists cannot drift apart; previously the ids
# came from groups.keys() and were paired with the lists purely by position.
reviews_data = (
    pd.DataFrame(
        {
            "place_ids": reviews_group["business_id"].apply(list),
            "texts": reviews_group["text"].apply(list),
            "ratings": reviews_group["stars"].apply(list),
            "timestamps": reviews_group["date"].apply(list),
        }
    )
    .rename_axis("user_id")
    .reset_index()
)


In [None]:
# Preview the per-user aggregated lists.
reviews_data.head()

In [None]:
# Number of consecutive reviews in each generated sequence (passed as window_size below).
sequence_length = 8
# Offset between the start positions of two consecutive sequences.
step_size = 1


def create_sequences(values, window_size, step_size):
    """Split ``values`` into fixed-length sliding windows.

    Windows start at 0, ``step_size``, ``2 * step_size``, ... as long as a
    full window fits. If the last of those does not end at the final element,
    one extra window covering the final ``window_size`` elements is added so
    the tail of the sequence is never lost.

    Unlike the previous implementation, the final window is emitted only
    once: before, whenever the last regular window already ended at the final
    element (always the case for ``step_size == 1``), the same window was
    appended a second time.

    Args:
        values: Sliceable sequence (e.g. a list) to cut into windows.
        window_size: Length of each window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        A list of windows, each a slice of ``values`` of length
        ``window_size``. Empty when ``values`` is shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive
            (the previous loop never terminated for such inputs).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    # Start index of the window that ends exactly at the final element.
    last_start = len(values) - window_size
    if last_start < 0:
        return []

    starts = list(range(0, last_start + 1, step_size))
    # Add the tail window only if the regular stride did not already land on it.
    if starts[-1] != last_start:
        starts.append(last_start)

    return [values[start:start + window_size] for start in starts]


# Replace each user's flat list with its list of fixed-length windows, using
# the same window length and step for all three columns.
for column in ("place_ids", "ratings", "texts"):
    reviews_data[column] = reviews_data[column].apply(
        lambda items: create_sequences(items, sequence_length, step_size)
    )

# Timestamps were only needed for ordering and are not windowed.
del reviews_data["timestamps"]

In [None]:
# Users for whom more than three sequences were generated.
reviews_data.loc[reviews_data["ratings"].str.len().gt(3)]

In [None]:
# Expand the per-user lists of windows into one row per window. Each column is
# exploded on its own with a fresh 0..n-1 index, then the pieces are placed
# side by side; the rows line up because every column holds the same number
# of windows per user.
reviews_data_text = reviews_data[["texts"]].explode(column="texts", ignore_index=True)
reviews_data_rating = reviews_data[["ratings"]].explode(column="ratings", ignore_index=True)
reviews_data_places = reviews_data[["user_id", "place_ids"]].explode(
    column="place_ids", ignore_index=True
)

rewievs_data_transformed = pd.concat(
    [reviews_data_places, reviews_data_text, reviews_data_rating],
    axis="columns",
)

In [None]:
# Remove rows containing any missing value.
rewievs_data_transformed = rewievs_data_transformed.dropna(axis="index", how="any")

In [None]:
# Inspect the final one-row-per-sequence table.
rewievs_data_transformed

In [None]:
# Random ~85/15 row-level split. A fixed seed makes the split, and therefore
# the two CSV files, reproducible across kernel restarts.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

random_selection = split_rng.random(len(rewievs_data_transformed.index)) <= 0.85
train_data = rewievs_data_transformed[random_selection]
test_data = rewievs_data_transformed[~random_selection]

train_data.to_csv("data/train_data.csv", index=False, sep=",")
test_data.to_csv("data/test_data.csv", index=False, sep=",")

In [None]:
# Inspect the held-out split.
test_data