# Temporal Point Process Data Preprocessing

In [None]:
import os
import sys
import json
import pickle
import requests
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

In [None]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
data_folder = os.path.join('..', 'data')

## Stack Overflow Badges

Download `stackoverflow.com-Badges.7z` from the [Stack Exchange Data Dump](https://archive.org/details/stackexchange) and extract it to `data/raw/stack_overflow/Badges.xml`. The data schema is documented [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede), and the badge types are listed [here](https://meta.stackexchange.com/questions/67397/what-are-the-badges-i-can-earn-on-each-site-and-what-are-the-exact-criteria-for).

In [None]:
!head ../data/raw/stack_overflow/Badges.xml

### Transforming the XML

In [None]:
import xml.etree.ElementTree as ET
import csv

In [None]:
def xml_to_csv_chunked(xml_file, csv_file, chunk_size=1000):
    """
    Stream the ``<row .../>`` elements of an XML file into a CSV file.

    The XML is parsed incrementally, so the whole document is never held in
    memory. The CSV header is taken from the attribute names of the first
    ``row`` element, and every row is written in that column order. An
    attribute missing from a row becomes an empty cell; attributes that are
    not part of the header are dropped.

    Args:
        xml_file: Path of the XML file to read.
        csv_file: Path of the CSV file to write (overwritten if it exists).
        chunk_size: Number of rows buffered before they are flushed to disk.
    """
    context = iter(ET.iterparse(xml_file, events=('start', 'end')))
    # The first event is the 'start' of the document root. Keep a handle on
    # the root so already-processed children can be released as we go.
    _, root = next(context)

    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default encoding.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        header = None
        rows = []

        for event, elem in tqdm(context):
            if event != 'end' or elem.tag != 'row':
                continue

            if header is None:
                header = list(elem.attrib.keys())
                writer.writerow(header)

            # Look values up by column name instead of relying on attribute
            # order, so a row with a missing attribute stays aligned.
            rows.append([elem.attrib.get(key, '') for key in header])

            if len(rows) >= chunk_size:
                writer.writerows(rows)
                rows = []

            # Drop the parsed children from the root to keep memory flat.
            root.clear()

        # Flush the last, partially filled chunk.
        if rows:
            writer.writerows(rows)

In [None]:
xml_file = f'{data_folder}/raw/stack_overflow/Badges.xml'
csv_file = f'{data_folder}/raw/stack_overflow/badges.csv'
xml_to_csv_chunked(xml_file, csv_file)

### Loading Data

In [None]:
df_badges = pd.read_csv(f"{data_folder}/raw/stack_overflow/badges.csv")

In [None]:
df_badges

In [None]:
df_badges.info()

### Preprocessing Data

In [None]:
# Drop NaNs and duplicates
df_badges = df_badges.dropna(subset=['UserId', 'Date', 'Name'])\
    .drop_duplicates(subset=['UserId', 'Date', 'Name'], keep='first')

In [None]:
# Drop tag-affiliated badges
df_badges = df_badges[~df_badges['TagBased']].copy()
# Transform the date times
df_badges['Date'] = pd.to_datetime(df_badges['Date'])

In [None]:
df_badges['Date'].describe()

In [None]:
df_badges['Name'].nunique()

In [None]:
# Select 2-year data
df_badges = df_badges[('2022-01-01' <= df_badges['Date']) & (df_badges['Date'] < '2024-01-01')]

In [None]:
df_badges['Date'].describe()

### Selecting Badges

Badges:

* Awarded multiple times: Buzz, Socratic, Enlightened, Guru, Lifejacket, Lifeboat, Nice Answer, Good Answer, Great Answer, Populist, Reversal (retired), Revival, Necromancer, Activist, Campaigner, Founder, Good Question, Great Question, Grassroots, Movement, Nice Question, Promoter, Revolution, Steward, Caucus, Constituent, Yearling, Not a Robot

* Awarded once per question: Favorite Question, Stellar Question, Nice Question, Good Question, Great Question, Popular Question, Notable Question, Famous Question

* Awarded once per answer: Favorite Answer, Stellar Answer

* Awarded once per review queue: Custodian, Reviewer

* Awarded once per post: Announcer, Booster, Publicist

In [None]:
# Badges that can be earned repeatedly, grouped by how they are awarded.
# Names that belong to more than one group are listed once; the retired
# "Reversal" badge is left out.
badge_list = {
    # Awarded multiple times
    "Buzz", "Socratic", "Enlightened", "Guru", "Lifejacket", "Lifeboat",
    "Nice Answer", "Good Answer", "Great Answer", "Populist", "Revival",
    "Necromancer", "Activist", "Campaigner", "Founder", "Grassroots",
    "Movement", "Promoter", "Revolution", "Steward", "Caucus", "Constituent",
    "Yearling", "Not a Robot",
    # Awarded once per question
    "Favorite Question", "Stellar Question", "Nice Question", "Good Question",
    "Great Question", "Popular Question", "Notable Question", "Famous Question",
    # Awarded once per answer
    "Favorite Answer", "Stellar Answer",
    # Awarded once per review queue
    "Custodian", "Reviewer",
    # Awarded once per post
    "Announcer", "Booster", "Publicist",
}
len(badge_list)

In [None]:
# Select badges that can be awarded multiple times
df_badges = df_badges[df_badges["Name"].isin(badge_list)]
len(df_badges)

### Selecting Users

In [None]:
user_badge_counts = df_badges.groupby('UserId')['Name'].count()

In [None]:
user_badge_counts.describe().astype(int)

In [None]:
# Select users who have earned 40-100 badges
user_list = user_badge_counts[(user_badge_counts >= 40) & (user_badge_counts <= 100)].index
len(user_list)

In [None]:
df_badges = df_badges[df_badges["UserId"].isin(user_list)]
len(df_badges)

### Selecting Badges Again

In [None]:
badge_type_counts = df_badges['Name'].value_counts()

In [None]:
# Select badges which have been awarded at least 200 times
badge_type_list = badge_type_counts[badge_type_counts >= 200].index
len(badge_type_list)

In [None]:
df_badges = df_badges[df_badges["Name"].isin(badge_type_list)]
len(df_badges)

### Splitting Sequences

In [None]:
df_badges.groupby('UserId')['Date'].count().describe()

In [None]:
df_badges['Name'].value_counts()

In [None]:
def get_seq_splits(df, seq_col):
    """
    Randomly assign every sequence ID to the train, dev or test split.

    The unique values of ``df[seq_col]`` are split 80% / 10% / 10% with a
    fixed random seed, so the assignment is reproducible for the same input.

    Args:
        df: DataFrame holding one row per event.
        seq_col: Name of the column that identifies the sequence of an event.

    Returns:
        A dict mapping each sequence ID to 'train', 'dev' or 'test'.
    """
    all_ids = df[seq_col].unique().tolist()

    # Carve out 80% for training, then halve the remainder into dev and test.
    train_ids, holdout_ids = train_test_split(all_ids, train_size=0.8, random_state=0)
    dev_ids, test_ids = train_test_split(holdout_ids, train_size=0.5, random_state=0)

    seq_splits = {}
    for split_name, ids in (('train', train_ids), ('dev', dev_ids), ('test', test_ids)):
        for seq_id in ids:
            seq_splits[seq_id] = split_name

    print(f'train: {len(train_ids)} seqs, val: {len(dev_ids)} seqs, test: {len(test_ids)} seqs')
    return seq_splits

In [None]:
badge_seq_splits = get_seq_splits(df=df_badges, seq_col='UserId')
len(badge_seq_splits)

### Saving Sequences

In [None]:
def save_seqs(
    df: pd.DataFrame, seq_col: str, seq_splits: dict,
    time_col: str, time_unit: float, type_col: str, seq_folder: str):
    """
    Save event sequences as ``train.json``, ``dev.json`` and ``test.json``.

    Events are grouped by ``seq_col`` and ordered by ``time_col`` within each
    group. Every group becomes one dict with the keys ``dim_process``,
    ``seq_idx``, ``seq_len``, ``time_since_start``, ``time_since_last_event``,
    ``type_event`` and ``type_text``, and is appended to the split named by
    ``seq_splits``. The input DataFrame is not modified.

    Args:
        df: One row per event. ``type_col`` is expected to contain no missing
            values.
        seq_col: Column identifying the sequence an event belongs to.
        seq_splits: Maps every sequence ID to 'train', 'dev' or 'test'.
        time_col: Column holding the event time (datetime-like).
        time_unit: Number of seconds per output time unit; elapsed seconds
            are divided by this value.
        type_col: Column holding the event type label.
        seq_folder: Output folder; created if it does not exist.

    Raises:
        KeyError: If a sequence ID in ``df`` is missing from ``seq_splits``.
    """
    dim_process = df[type_col].nunique()
    # Type IDs follow the order in which the labels first appear in df.
    type_text2id = {type_text: type_id for type_id, type_text in enumerate(df[type_col].unique())}
    type_id2text = {type_id: type_text for type_text, type_id in type_text2id.items()}
    data = {'train': [], 'dev': [], 'test': []}
    print(f'type_id2text: {type_id2text}')

    for seq_id, group in tqdm(df.groupby(seq_col)):
        # A stable sort keeps events with identical timestamps in their
        # original order, so the output is deterministic.
        group = group.sort_values(by=time_col, kind='stable')
        split = seq_splits[seq_id]
        event_times = pd.to_datetime(group[time_col])
        init_time = event_times.min()
        pre_event_time = init_time
        event_seq = {
            'dim_process': dim_process,
            'seq_idx': len(data[split]),
            'seq_len': len(group),
            'time_since_start': [],
            'time_since_last_event': [],
            'type_event': [],
            'type_text': [],
        }

        # Iterate only the two columns that are needed; type IDs are looked
        # up from the mapping instead of being written back into df.
        for event_time, type_text in zip(event_times, group[type_col]):
            time_since_start = (event_time - init_time).total_seconds() / time_unit
            time_since_last_event = (event_time - pre_event_time).total_seconds() / time_unit
            event_seq['time_since_start'].append(time_since_start)
            event_seq['time_since_last_event'].append(time_since_last_event)
            event_seq['type_event'].append(type_text2id[type_text])
            event_seq['type_text'].append(type_text)
            pre_event_time = event_time

        data[split].append(event_seq)

    os.makedirs(seq_folder, exist_ok=True)
    for split in ['train', 'dev', 'test']:
        json_path = f'{seq_folder}/{split}.json'
        with open(json_path, 'w') as file:
            json.dump(data[split], file, indent=4)
        print(f'{split} saved to {json_path}')

In [None]:
save_seqs(
    df=df_badges, seq_col='UserId', seq_splits=badge_seq_splits,
    time_col='Date', time_unit=60*60*24*30, type_col='Name',
    seq_folder=f'{data_folder}/stack_overflow',
)

## Chicago Crimes

Download the data from [Crimes - 2001 to Present](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2/about_data) to `data/raw/chicago_crime/Crimes_-_2001_to_Present.csv`.

### Loading Data

In [None]:
df_crimes = pd.read_csv(f'{data_folder}/raw/chicago_crime/Crimes_-_2001_to_Present.csv')

In [None]:
df_crimes

In [None]:
df_crimes.info()

### Preprocessing Data

In [None]:
df_crimes['Date'] = pd.to_datetime(df_crimes['Date'])
df_crimes['Primary Type'] = df_crimes['Primary Type'].str.title()

In [None]:
df_crimes = df_crimes.dropna(subset=['Date', 'Block', 'Primary Type'])\
    .drop_duplicates(subset=['Date', 'Block', 'Primary Type'], keep='first')

In [None]:
df_crimes = df_crimes[('2022-01-01' <= df_crimes['Date']) & (df_crimes['Date'] < '2024-01-01')]
len(df_crimes)

### Selecting Crimes

In [None]:
crime_counts = df_crimes['Primary Type'].value_counts()

In [None]:
crime_counts

In [None]:
crime_list = crime_counts[crime_counts >= 500].index
len(crime_list)

In [None]:
df_crimes = df_crimes[df_crimes['Primary Type'].isin(crime_list)]
len(df_crimes)

### Selecting Blocks

In [None]:
block_counts = df_crimes['Block'].value_counts()

In [None]:
block_counts

In [None]:
block_list = block_counts[(30 <= block_counts) & (block_counts <= 120)].index
len(block_list)

In [None]:
df_crimes = df_crimes[df_crimes['Block'].isin(block_list)]
len(df_crimes)

In [None]:
df_crimes['Primary Type'].nunique()

In [None]:
df_crimes['Primary Type'].value_counts()

### Saving Sequences

In [None]:
df_crimes.groupby('Block')['Date'].count().describe()

In [None]:
crime_seq_splits = get_seq_splits(df=df_crimes, seq_col='Block')
len(crime_seq_splits)

In [None]:
save_seqs(
    df=df_crimes, seq_col='Block', seq_splits=crime_seq_splits,
    time_col='Date', time_unit=60*60*24*30, type_col='Primary Type',
    seq_folder=f'{data_folder}/chicago_crime',
)

## NYC Taxi Trips

Download the [NYC Taxi Trips](https://www.andresmh.com/nyctaxitrips/) to `data/raw/nyc_taxi/` and [NYC Borough Boundaries](https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm) ("Export" then "Original") to `data/raw/nyc_taxi/nybb_24c/`.

### Loading Data

In [None]:
df_trips = pd.read_csv(f'{data_folder}/raw/nyc_taxi/trip_data_5.csv')

In [None]:
df_trips.info()

In [None]:
df_trips.head()

### Preprocessing Data

In [None]:
df_trips.columns

In [None]:
df_trips.columns = df_trips.columns.str.strip()
df_trips.columns

In [None]:
df_trips = df_trips.dropna(subset=[
    'hack_license', 'pickup_datetime', 'dropoff_datetime', 
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
]).drop_duplicates(subset=[
    'hack_license', 'pickup_datetime', 'dropoff_datetime', 
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
], keep='first')

In [None]:
df_trips = df_trips[
    (df_trips['pickup_longitude'] != 0) & (df_trips['pickup_latitude'] != 0)
    & (df_trips['dropoff_longitude'] != 0) & (df_trips['dropoff_latitude'] != 0)
]

### Selecting Pickup Times

In [None]:
df_trips["pickup_datetime"] = pd.to_datetime(df_trips["pickup_datetime"])
df_trips["dropoff_datetime"] = pd.to_datetime(df_trips["dropoff_datetime"])

In [None]:
df_trips.pickup_datetime.describe()

In [None]:
df_trips.dropoff_datetime.describe()

In [None]:
df_trips = df_trips[(df_trips.pickup_datetime >= "2013-05-01") & (df_trips.pickup_datetime < "2013-05-08")]

In [None]:
df_trips.pickup_datetime.describe()

### Loading Boroughs

In [None]:
import geopandas as gpd
from shapely.geometry import Point

In [None]:
gdf_boroughs = gpd.read_file(f'{data_folder}/raw/nyc_taxi/nybb_24c/')
print("Boroughs CRS:", gdf_boroughs.crs)
gdf_boroughs

### Getting Boroughs

In [None]:
df_trips['pickup_geometry'] = df_trips.apply(lambda x: Point((x['pickup_longitude'], x['pickup_latitude'])), axis=1)
df_trips['dropoff_geometry'] = df_trips.apply(lambda x: Point((x['dropoff_longitude'], x['dropoff_latitude'])), axis=1)

In [None]:
# Convert the DataFrame to a GeoDataFrame, with the correct initial CRS (EPSG:4326)
gdf_pickups = gpd.GeoDataFrame(df_trips, geometry='pickup_geometry', crs='EPSG:4326')
gdf_dropoffs = gpd.GeoDataFrame(df_trips, geometry='dropoff_geometry', crs='EPSG:4326')
# Reproject the GeoDataFrame to the CRS of the boroughs shapefile (EPSG:2263)
gdf_pickups = gdf_pickups.to_crs(gdf_boroughs.crs)
gdf_dropoffs = gdf_dropoffs.to_crs(gdf_boroughs.crs)

In [None]:
# Perform spatial join to get the boroughs
gdf_pickups = gpd.sjoin(gdf_pickups, gdf_boroughs, how='left', predicate='intersects')
gdf_dropoffs = gpd.sjoin(gdf_dropoffs, gdf_boroughs, how='left', predicate='intersects')

In [None]:
df_trips['pickup_borough'] = gdf_pickups['BoroName']
df_trips['dropoff_borough'] = gdf_dropoffs['BoroName']

### Merging Events

In [None]:
df_trips = df_trips[
    df_trips['pickup_borough'].notna() & (df_trips['pickup_borough'] != 'Staten Island') 
    & df_trips['dropoff_borough'].notna() & (df_trips['dropoff_borough'] != 'Staten Island')].copy()

In [None]:
df_trips['pickup_type'] = df_trips['pickup_borough'] + ' Pickup'
df_trips['dropoff_type'] = df_trips['dropoff_borough'] + ' Dropoff'

In [None]:
df_pickups = df_trips[['hack_license', 'pickup_datetime', 'pickup_type']]\
    .rename(columns={'pickup_datetime': 'datetime', 'pickup_type': 'type'})
df_dropoffs = df_trips[['hack_license', 'dropoff_datetime', 'dropoff_type']]\
    .rename(columns={'dropoff_datetime': 'datetime', 'dropoff_type': 'type'})
df_all_trips = pd.concat([df_pickups, df_dropoffs], ignore_index=True)\
    .sort_values(by=['hack_license', 'datetime'])

In [None]:
df_all_trips.head(6)

### Getting Sequence IDs

In [None]:
df_all_trips = df_all_trips.sort_values(by=['hack_license', 'datetime'])

In [None]:
# Assign a sequence ID to every event. A new sequence starts at the first
# event, whenever the driver (hack_license) changes, or when more than
# `max_hours` separate two consecutive events of the same driver.
# This relies on df_all_trips being sorted by ['hack_license', 'datetime'].
max_hours = 12

# Comparing each row with the previous row (shift/diff) avoids label-based
# lookups such as .loc[0], which would not address the first row here because
# the index is no longer in positional order after sorting.
license_changed = df_all_trips['hack_license'] != df_all_trips['hack_license'].shift()
long_gap = df_all_trips['datetime'].diff() > pd.Timedelta(hours=max_hours)

# Cumulative count of sequence starts, shifted so that IDs begin at 0.
df_all_trips['seq_id'] = (license_changed | long_gap).cumsum() - 1

In [None]:
df_all_trips['seq_id'].value_counts().describe()

### Selecting Sequences

In [None]:
seq_counts = df_all_trips['seq_id'].value_counts()
seq_counts.describe()

In [None]:
seq_list = seq_counts[(seq_counts >= 100) & (seq_counts <= 160)]
seq_list.describe()

In [None]:
df_all_trips = df_all_trips[df_all_trips['seq_id'].isin(seq_list.index)]
len(df_all_trips)

In [None]:
df_all_trips['type'].value_counts()

### Saving Sequences

In [None]:
df_all_trips.groupby('seq_id')['datetime'].count().describe()

In [None]:
trip_seq_splits = get_seq_splits(df=df_all_trips, seq_col='seq_id')
len(trip_seq_splits)

In [None]:
save_seqs(
    df=df_all_trips, seq_col='seq_id', seq_splits=trip_seq_splits,
    time_col='datetime', time_unit=60*60, type_col='type',
    seq_folder=f'{data_folder}/nyc_taxi',
)

## US Earthquakes

### Downloading Data

Download the US earthquake data from 2020-01-01 (inclusive) to 2024-01-01 (exclusive) to `data/raw/us_earthquake`.

In [None]:
from io import StringIO
from datetime import datetime, timedelta

In [None]:
def download_earthquake_data_chunk(start_time, end_time, region, timeout=60):
    """
    Download earthquake data for a given time chunk.

    Args:
        start_time: Start date of the query (YYYY-MM-DD).
        end_time: End date of the query (YYYY-MM-DD).
        region: Dict with the keys 'minlatitude', 'maxlatitude',
            'minlongitude' and 'maxlongitude' bounding the query area.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the download forever.

    Returns:
        The raw CSV response body as bytes if the server answered with
        HTTP 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout is exceeded.
    """
    # USGS Earthquake API endpoint
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"

    # Query parameters. No minmagnitude/maxmagnitude bounds are sent.
    params = {
        "format": "csv",
        "starttime": start_time,
        "endtime": end_time,
        "minlatitude": region["minlatitude"],
        "maxlatitude": region["maxlatitude"],
        "minlongitude": region["minlongitude"],
        "maxlongitude": region["maxlongitude"],
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download data for {start_time} to {end_time}. HTTP Status Code: {response.status_code}.")
        return None

    print(f"Data downloaded successfully for {start_time} to {end_time}.")
    return response.content

In [None]:
def download_earthquake_data(start_time, end_time, region, output_file, chunk_size):
    """
    Download earthquake data by splitting the request into smaller chunks.

    The range [start_time, end_time) is walked in steps of ``chunk_size``
    days. Each step is fetched with ``download_earthquake_data_chunk`` and
    the results are concatenated and written to ``output_file`` as CSV.

    Args:
        start_time: Start date of the whole range (YYYY-MM-DD).
        end_time: End date of the whole range (YYYY-MM-DD).
        region: Bounding box dict passed through to the chunk downloader.
        output_file: Path of the CSV file to write; its parent folder is
            created if necessary.
        chunk_size: Length of one request window in days; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the loop would never
            advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}.")

    start_date = datetime.strptime(start_time, "%Y-%m-%d")
    end_date = datetime.strptime(end_time, "%Y-%m-%d")

    # Create the output folder up front so a missing directory is caught
    # before the (long) download rather than after it.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    chunk_dfs = []
    empty_chunks = []

    current_start = start_date
    while current_start < end_date:
        # End of the current window, clipped to the overall end date.
        current_end = min(current_start + timedelta(days=chunk_size), end_date)
        chunk_start = current_start.strftime("%Y-%m-%d")
        chunk_end = current_end.strftime("%Y-%m-%d")

        data_chunk = download_earthquake_data_chunk(chunk_start, chunk_end, region)

        if data_chunk:
            chunk_dfs.append(pd.read_csv(StringIO(data_chunk.decode('utf-8'))))
        else:
            # Failed request or empty body: remember it so it can be reported.
            empty_chunks.append((chunk_start, chunk_end))

        # Move to the next window.
        current_start = current_end

    # Concatenate once at the end instead of growing a DataFrame in the loop.
    all_data = pd.concat(chunk_dfs, ignore_index=True) if chunk_dfs else pd.DataFrame()
    all_data.to_csv(output_file, index=False)

    if empty_chunks:
        print(f"Data saved to {output_file}, but {len(empty_chunks)} chunk(s) returned no data: {empty_chunks}.")
    else:
        print(f"Data downloaded successfully and saved to {output_file}.")

In [None]:
# Parameters for the earthquake search
start_time = "2020-01-01"    # Start date
end_time = "2024-01-01"      # End date
region = {
    "minlatitude": 24.6,     # Min latitude of the region
    "maxlatitude": 50.0,     # Max latitude
    "minlongitude": -125.0,  # Min longitude
    "maxlongitude": -65.0    # Max longitude
}
output_file = f"{data_folder}/raw/us_earthquake/us_earthquakes.csv"

# Download the earthquake data
download_earthquake_data(start_time, end_time, region, output_file, chunk_size=30)

### Loading Data

In [None]:
df_earthquakes = pd.read_csv(f'{data_folder}/raw/us_earthquake/us_earthquakes.csv')

In [None]:
df_earthquakes.info()

In [None]:
pd.to_datetime(df_earthquakes.time).describe()

### Preprocessing Data

In [None]:
df_earthquakes = df_earthquakes[
    (df_earthquakes["type"] == "earthquake") & (df_earthquakes["status"] == "reviewed") 
    & (df_earthquakes['magType'] == 'ml')]
df_earthquakes = df_earthquakes.dropna(subset=['time', 'latitude', 'longitude', 'mag'])\
    .drop_duplicates(subset=['time', 'latitude', 'longitude', 'mag'], keep='first')
df_earthquakes["time"] = pd.to_datetime(df_earthquakes["time"])
df_earthquakes['coordinate'] = df_earthquakes.apply(
    lambda row: (round(row['latitude']), round(row['longitude'])), axis=1)

In [None]:
df_earthquakes['coordinate'].value_counts()

### Getting Sequence IDs

In [None]:
df_earthquakes = df_earthquakes.sort_values(by=["coordinate", "time"]).reset_index(drop=True)

In [None]:
# Assign a sequence ID to every earthquake. The counter advances whenever the
# rounded coordinate changes, or when more than `max_hours` separate two
# consecutive events at the same coordinate. Rows are expected to be sorted
# by ['coordinate', 'time'] with a fresh 0-based index.
max_hours = 24
seq_count = 0
seq_ids = []
last_coord = df_earthquakes.loc[0, 'coordinate']
last_time = df_earthquakes.loc[0, 'time']

events = zip(df_earthquakes["coordinate"], df_earthquakes["time"])
for coord, event_time in tqdm(events, total=len(df_earthquakes)):
    # The time gap is only evaluated when the coordinate is unchanged.
    starts_new_seq = (
        coord != last_coord
        or (event_time - last_time).total_seconds() / 3600 > max_hours
    )
    if starts_new_seq:
        seq_count += 1

    seq_ids.append(seq_count)
    last_coord = coord
    last_time = event_time

df_earthquakes["seq_id"] = seq_ids

In [None]:
df_earthquakes.head()

### Selecting Sequences

In [None]:
df_earthquakes["seq_id"].value_counts().describe()

In [None]:
earthquake_counts = df_earthquakes["seq_id"].value_counts()
earthquake_list = earthquake_counts[(earthquake_counts >= 5) & (earthquake_counts <= 30)].index
len(earthquake_list)

In [None]:
df_earthquakes = df_earthquakes[df_earthquakes["seq_id"].isin(earthquake_list)]

In [None]:
df_earthquakes.groupby('seq_id')['time'].count().describe()

### Setting Event Types

In [None]:
df_earthquakes['mag'].describe()

In [None]:
df_earthquakes["type"] = "Small"
df_earthquakes.loc[df_earthquakes["mag"] >= 1, "type"] = "Medium"
df_earthquakes.loc[df_earthquakes["mag"] >= 2, "type"] = "Large"
df_earthquakes["type"].value_counts()

### Saving Sequences

In [None]:
df_earthquakes.groupby('seq_id')['time'].count().describe()

In [None]:
earthquake_seq_splits = get_seq_splits(df=df_earthquakes, seq_col='seq_id')
len(earthquake_seq_splits)

In [None]:
save_seqs(
    df=df_earthquakes, seq_col='seq_id', seq_splits=earthquake_seq_splits,
    time_col='time', time_unit=60*60*24, type_col='type',
    seq_folder=f'{data_folder}/us_earthquake',
)

## Amazon Reviews

Download the 29 small subsets (ratings only) of [Amazon Review Data](https://nijianmo.github.io/amazon/) to the folder `data/raw/amazon_review/`. Make sure to rename `AMAZON_FASHION.csv` to `Amazon_Fashion.csv`.

### Loading Data

In [None]:
# Load every per-category ratings CSV and tag its rows with the category name.
amazon_review_folder = f"{data_folder}/raw/amazon_review"
dfs_reviews = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of df_reviews (and anything derived from it, such as "keep first"
# de-duplication and type IDs assigned by first appearance) reproducible.
for file_name in tqdm(sorted(os.listdir(amazon_review_folder))):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(amazon_review_folder, file_name)
    # The files have no header row, so the column names are supplied here.
    df_reviews_each = pd.read_csv(file_path, names=["item", "user", "rating", "timestamp"])
    # Category label from the file name, e.g. 'Amazon_Fashion.csv' -> 'Amazon Fashion'.
    # splitext removes only the trailing extension.
    df_reviews_each['category'] = os.path.splitext(file_name)[0].replace('_', ' ')
    dfs_reviews.append(df_reviews_each)

df_reviews = pd.concat(dfs_reviews, ignore_index=True)

In [None]:
df_reviews

### Preprocessing Data

In [None]:
df_reviews['date'] = pd.to_datetime(df_reviews['timestamp'], unit='s')

In [None]:
df_reviews['date'].describe()

In [None]:
df_reviews = df_reviews[("2018-01-01" <= df_reviews["date"]) & (df_reviews["date"] <= "2018-06-30")]

In [None]:
df_reviews = df_reviews.dropna(subset=['date', 'user', 'category'])\
    .drop_duplicates(subset=['date', 'user', 'category'], keep='first')

In [None]:
len(df_reviews)

### Selecting Categories

In [None]:
category_review_counts = df_reviews["category"].value_counts()
category_review_counts

In [None]:
len(category_review_counts)

In [None]:
category_list = category_review_counts[category_review_counts >= 100000].index
len(category_list)

In [None]:
# df_reviews = df_reviews[df_reviews["category"].isin(category_list)]
df_reviews.loc[~df_reviews["category"].isin(category_list), "category"] = "Other"
len(df_reviews)

### Selecting Users

In [None]:
user_review_counts = df_reviews["user"].value_counts()
user_review_counts

In [None]:
user_list = user_review_counts[(user_review_counts >= 40) & (user_review_counts <= 200)].index
len(user_list)

In [None]:
df_reviews[df_reviews["user"].isin(user_list)]["category"].value_counts()

In [None]:
df_reviews = df_reviews[df_reviews["user"].isin(user_list)]
len(df_reviews)

### Saving Sequences

In [None]:
df_reviews["user"].nunique(), df_reviews["category"].nunique()

In [None]:
df_reviews["category"].value_counts()

In [None]:
df_reviews.groupby('user')['date'].count().describe()

In [None]:
review_seq_splits = get_seq_splits(df=df_reviews, seq_col='user')
len(review_seq_splits)

In [None]:
save_seqs(
    df=df_reviews, seq_col='user', seq_splits=review_seq_splits,
    time_col='date', time_unit=60*60*24*7, type_col='category',
    seq_folder=f'{data_folder}/amazon_review',
)