# Import Packages

In [1]:
import pandas as pd
import json
import classes_functions as cf
import numpy as np
import re
from tqdm.notebook import tqdm
from scipy.interpolate import make_interp_spline
import plotly.express as px

# Load Data

In [33]:
# Columns every product record is expected to provide. They are placed first
# (and created empty if absent) so downstream cells can rely on them.
expected_columns = ["shop", "url", "modelID", "featuresMap", "title"]

# Load brand data (https://www.kaggle.com/datasets/devsubhash/television-brands-ecommerce-dataset)
brand_df = pd.read_csv("./Data/TV_Final.csv")

# Brands that are added by hand on top of the ones listed in the brand dataset.
extra_brands = [
    "nec", "insignia", "supersonic", "viewsonic", "vizio", "coby",
    "naxa", "rca", "dynex", "magnavox", "sunbritetv", "avue",
    "venturer", "pyle", "westinghouse", "proscan", "sceptre",
    "contex", "mitsubishi", 'epson', "hannspree", "curtisyoung",
    'hp', "seiki", "azend", "hiteker", "upstar", "optoma",
    "affinity", "viore", "craig", "elo", "gpx",
]

# Normalise the dataset brands (lower case, no surrounding whitespace).
# Missing values are dropped first: calling .lower() on a NaN would raise.
dataset_brands = brand_df["Brand"].dropna().astype(str).str.lower().str.strip()

# Keep first-seen order, drop duplicates that only appear after normalisation,
# and drop empty names (an empty alternative would match the empty string).
brands_ref = [name for name in dict.fromkeys([*dataset_brands, *extra_brands]) if name]

# Create the simple regex pattern to find and match brand names:
# one capturing group holding every brand as a whole-word alternative.
brand_pattern = '|'.join(rf'\b{re.escape(word)}\b' for word in brands_ref)
brand_regex_pattern = f'({brand_pattern})'

# Reading the json as a dict. The encoding is stated explicitly so the result
# does not depend on the platform default.
with open("./Data/TVs-all-merged.json", encoding="utf-8") as json_data:
    df_dict = json.load(json_data)

# Populate dataframe with TV's: build one frame per top-level entry and
# concatenate once, instead of growing the frame inside a loop.
frames = [pd.DataFrame(records) for records in df_dict.values()]
if frames:
    df = pd.concat(frames, ignore_index=True)
    extra_columns = [col for col in df.columns if col not in expected_columns]
    df = df.reindex(columns=expected_columns + extra_columns)
else:
    # Nothing to load: fall back to an empty frame with the expected schema.
    df = pd.DataFrame(columns=expected_columns)

# Prep Data

In [34]:
# Convert all titles to lower case for easier matching
df["l_title"] = df["title"].str.lower()

# Lists of words that will be replaced by "hz" and "inch" respectively
hertz_list = ['hertz', ' hz', '-hz', ' hertz']
inch_list = ['inches', ' inches', '"', ' "', '-inch', ' inch']

# Apply the literal replacements one at a time, in list order, so that every
# variant operates on the output of the previous one
# (e.g. ' hertz' -> ' hz' -> 'hz'). Doing it explicitly keeps the result
# independent of how pandas batches a list passed to Series.replace.
for variant in hertz_list:
    df['l_title'] = df['l_title'].str.replace(variant, 'hz', regex=False)
for variant in inch_list:
    df['l_title'] = df['l_title'].str.replace(variant, 'inch', regex=False)

# Remove the remaining hyphens
df['l_title'] = df['l_title'].str.replace('-', '', regex=False)

# Remove things in parenthesis and square brackets (usually not the most
# important information). The bracket pattern previously carried
# JavaScript-style /.../ delimiters, which Python treats as literal slashes,
# so it never matched a plain "[...]"; it is also non-greedy across brackets
# now, mirroring the parenthesis pattern.
parenthesis_regex = r'\([^)]*\)'
sq_bracket_refex = r'\[[^\]]*\]'
df['l_title'] = df['l_title'].str.replace(parenthesis_regex, '', regex=True)
df['l_title'] = df['l_title'].str.replace(sq_bracket_refex, '', regex=True)

# Pattern that extracts the main part of the model words: tokens in which a
# digit run is directly followed or preceded by non-digit characters.
regex_pattern = r'([a-zA-Z0-9]*(([0-9]+[^0-9, ]+)|([^0-9, ]+[0-9]+))[a-zA-Z0-9]*)'
matches = df['l_title'].str.extractall(regex_pattern).groupby(level=0)[0].apply(set).apply(sorted)
# Index-aligned assignment: rows without a match get NaN (same as a left merge
# on the index), and re-running the cell does not create duplicate columns.
df['main_feature_lst'] = matches

# Next extract the brand from the titles (sorted unique brand names per row)
brand_matches = df['l_title'].str.extractall(brand_regex_pattern).groupby(level=0)[0].apply(set).apply(sorted)
df['brand'] = brand_matches


def _combine_tokens(model_words, brand_words):
    """Return model words followed by brand words, or "" if either is missing.

    Only the two inputs are inspected, so a missing value in an unrelated
    column of the row cannot blank out the combined tokens.
    """
    if isinstance(model_words, list) and isinstance(brand_words, list):
        return model_words + brand_words
    return ""


# Combine brand and main extract
df['appended_column'] = pd.Series(
    [_combine_tokens(m, b) for m, b in zip(df['main_feature_lst'], df['brand'])],
    index=df.index,
    dtype=object,
)
df['main_feature_w_id'] = df["appended_column"].apply(lambda tokens: ' '.join(map(str, tokens)))

# Find the model ID candidates: whole tokens that contain at least one digit
# and at least one letter and do not end in p, hz, inch, k or d. Of the
# candidates in a row, the one ranked highest by cf.get_length is kept
# (presumably the longest token -- confirm in classes_functions); rows without
# any candidate get NaN.
id_regex = r'\b(?=\w*\d)(?=\w*[a-zA-Z])(?![\w\d]*(?:p|hz|inch|k|d)\b)(\w+)\b'
df["matched_id"] = df["main_feature_w_id"].str.findall(id_regex).apply(lambda found: max(found, default=np.nan, key=cf.get_length))

# Final model word: all extracted tokens glued together without spaces
df['main_feature'] = df['main_feature_w_id'].astype(str).str.replace(' ', '', regex=False)

# Main For-Loop (to create the graphs)

In [4]:
# Loop Parameters
shingling_size = 3
hash_size = 8
train_frac = 0.63
nbr_runs = 10

# Lists to save all parameters (one entry per evaluated band count)
total_f1 = []
total_f1_star = []
bands = []
fraction_of_comparisons = []

# Distinct integer band counts, log-spaced from 10 up to int(10 ** 2.11) = 128
band_candidates = sorted({int(value) for value in np.logspace(1, 2.11, 100)})

for band_nbr in band_candidates:

    # Per-run results for the current band count
    f1_list = []
    f1_star_list = []
    runtime_list = []  # collected per run; not aggregated in this cell
    fraction_of_comparison_list = []

    for run_idx in tqdm(range(nbr_runs)):
        f1, f1_star, confusion_matrix, all_pairs_without_dupes_test, runtime_sec, fraction_of_comparison = cf.main_run(df, train_frac, shingling_size, hash_size, band_nbr)
        f1_list.append(f1)
        f1_star_list.append(f1_star)
        runtime_list.append(runtime_sec)
        fraction_of_comparison_list.append(fraction_of_comparison)

    # Summarise the repeated runs by their median
    total_f1.append(np.median(f1_list))
    total_f1_star.append(np.median(f1_star_list))
    fraction_of_comparisons.append(np.median(fraction_of_comparison_list))
    # Record the band count itself. The inner run index was appended here
    # before, which stored nbr_runs - 1 for every entry.
    bands.append(band_nbr)

# Plotting

In [32]:
# Load the stored experiment results, fix the misspelled runtime column, and
# collapse rows that share a fraction of comparisons to their median metrics.
final_plotting_df = (
    pd.read_csv("./Data/plotting_data.csv", index_col=0)
    .rename(columns={'runtimime': 'runtime'})
    .sort_values("fractions")
    .groupby("fractions", as_index=False)
    .agg({"f1": "median", "f1_star": "median", "runtime": "median"})
)

# Smoothing is applied to have better looking graphs. With a 20-row window and
# the default min_periods, the first 19 points are NaN and are not drawn.
smoothed_runtime = final_plotting_df["runtime"].rolling(window=20).mean()

axis_labels = {
    "x": "Fraction of Comparisons",
    "y": "Runtime (s)",
}

fig = px.line(
    x=final_plotting_df["fractions"],
    y=smoothed_runtime,
    labels=axis_labels,
)

# Settings for the "download plot as image" button of the figure toolbar
config = {
    'toImageButtonOptions': dict(format='png', height=400, width=600, scale=6),
}

fig.show(config=config)