## 1. Preparation

In [None]:
import pandas as pd
from warnings import filterwarnings
import numpy as np
filterwarnings("ignore")
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from davood_ml_functions import *
import numpy as np
from symspellpy import SymSpell , Verbosity
import textwrap

In [2]:
# Load the cleaned training data (first CSV column becomes the index) and keep
# only the rating (y), the product id (asin) and the comment text.
df = pd.read_csv("train_data_cleaned.csv", index_col=0).loc[:, ["y", "asin", "text"]]

In [3]:
# Preview the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,y,asin,text
0,2,511189877,i have an older urc-wr7 remote and thought thi...
1,5,511189877,first time i have ever had a remote that neede...
2,4,511189877,got them and only 2 of them worked. company ca...
3,5,511189877,i got tired of the remote being on the wrong s...
4,5,594459451,after purchasing cheap cords from another webs...


## 2. Correct Misspellings

In [4]:
# Spell checker configured to look up corrections at most 2 edits away.
sym_spell = SymSpell(max_dictionary_edit_distance = 2 , prefix_length = 7)

# load_dictionary() reports a missing file by returning False rather than
# raising. Fail loudly here: with an empty dictionary the lookups in later
# cells would return no suggestions and every word would be kept unchanged.
dictionary_loaded = sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt" , term_index = 0 , count_index = 1)
if not dictionary_loaded:
    raise FileNotFoundError(
        "SymSpell frequency dictionary 'frequency_dictionary_en_82_765.txt' could not be loaded"
    )
dictionary_loaded

True

In [5]:
def correct_text(text):
    """Spell-correct a comment token by token.

    The text is split on whitespace and each token is replaced by the first
    suggestion the module-level ``sym_spell`` returns for it (closest matches,
    at most 2 edits away). Tokens without any suggestion are kept as they are.

    Parameters
    ----------
    text : object
        The comment to correct. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        The corrected tokens joined by single spaces.
    """
    if not isinstance(text , str):
        return ""

    def _best_match(token):
        # Fall back to the original token when the lookup finds nothing.
        candidates = sym_spell.lookup(token , Verbosity.CLOSEST , max_edit_distance = 2)
        if candidates:
            return candidates[0].term
        return token

    return " ".join(_best_match(token) for token in text.split())

In [21]:
# Spell-correct every comment in place; correct_text turns non-string
# entries into "".
df["text"] = df["text"].map(correct_text)

## 3. Load a Word2Vec Model

In [22]:
# Download link for the pretrained word vectors loaded in the next cell:
# https://figshare.com/ndownloader/files/41403483

In [23]:
# Read the pretrained embeddings from the binary word2vec-format file.
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [24]:
def get_comment_vector(text, model):
    """Average the word vectors of a comment's in-vocabulary tokens.

    Parameters
    ----------
    text : object
        The comment; it is split on whitespace. Anything that is not a
        ``str`` (e.g. NaN or None from a missing value) is treated like an
        empty comment, matching how ``correct_text`` handles such values.
    model : mapping-like
        Must support ``token in model``, ``model[token]`` and expose
        ``vector_size`` (e.g. a gensim ``KeyedVectors``).

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    # Missing values are not strings and have no .split(); map them to the
    # same zero vector an empty comment gets instead of raising.
    if not isinstance(text, str):
        return np.zeros(model.vector_size)

    vectors = [model[token] for token in text.split() if token in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

## 4. Find Similar Keywords

In [26]:
# Seed terms (including common misspellings) used to find related words.
keywords = [
    "guarantee",
    "warranty",
    "warrenty",
    "guarentee",
    "assurance",
    "refund",
    "replacement",
]

In [27]:
def get_similar_words(words , model , topn = 5):
    """Expand seed words with their nearest neighbours in an embedding model.

    Parameters
    ----------
    words : iterable of str
        Seed words. Every seed is part of the result, whether or not the
        model knows it.
    model : object
        Must expose ``key_to_index`` (vocabulary membership) and
        ``most_similar(word, topn=...)`` returning ``(word, score)`` pairs
        (e.g. a gensim ``KeyedVectors``).
    topn : int, default 5
        Number of neighbours requested for each in-vocabulary seed.

    Returns
    -------
    list of str
        The seeds plus all retrieved neighbours, without duplicates, in
        sorted order.
    """
    # Materialise first: `words` is traversed twice (seed set + loop), which
    # would silently skip the expansion for a one-shot iterable.
    words = list(words)
    similar = set(words)
    for w in words:
        if w in model.key_to_index:
            similar.update(neighbour for neighbour , _ in model.most_similar(w , topn = topn))
    # Sorted so the result does not depend on set iteration order, which
    # varies between interpreter sessions for strings.
    return sorted(similar)

In [13]:
# Expand the seed keywords with their nearest neighbours, then normalise:
# lower-case everything and turn underscores into spaces so multi-word
# entries read as phrases. A set collapses entries that become identical
# after normalisation.
related_words = get_similar_words(keywords , model)
related_words = {s.lower().replace("_" , " ") for s in related_words}

# Phrases excluded by hand from the keyword list.
to_remove = ["tmo" , "is'nt" , "developments affecting siaf" , "dont" , "macbook pro"]

# Set difference instead of list.remove(): it drops every copy of an excluded
# phrase and does not raise when a phrase is absent from the neighbours.
# Sorted so the final list is the same on every run.
related_words = sorted(related_words.difference(to_remove))

In [28]:
# Show the final keyword list, wrapped at 60 characters per line.
print("\n".join(textwrap.wrap(" , ".join(related_words), width=60)))

guarenteed , replacing , limited warranty , replacment ,
replacement , reassurances , assure , assurances ,
warranties , guarantees , replacements , warranty ,
gaurantee , refund , guaranteeing , assurance , guaranteed ,
refunds , guarentee , warrantee , rebate , replace ,
lifetime warranty , guarantee , gurantee , applecare ,
warrenty , refunded


## 5. Select Comments Containing Keywords

In [29]:
# Flag (1/0) each comment that contains at least one related keyword or
# phrase as a substring.
df["related"] = df["text"].map(
    lambda comment: int(any(phrase in str(comment) for phrase in related_words))
)

In [30]:
# Keep only the flagged comments, then discard the helper column.
df = df[df["related"] == 1].drop(columns="related")

In [31]:
# Example of a selected comment (row label 838891), wrapped for reading.
t1 = df.at[838891, "text"]
print(textwrap.fill(t1, width=60))

the xii series is corsairs flagship side the sad market is
pretty crowded but this sad delivers solid performance at a
fair price with a year warranty i am using it for gaming and
i am happy with it


In [32]:
# Another selected comment (row label 159), wrapped for reading.
t2 = df.at[159, "text"]
print(textwrap.fill(t2, width=60))

i was only able to use it just one time every time i tried
to use it again my var shut off no matter how many times i
tried to get it to clean my var shut off and the var it only
six months old i am going to try and return it and see if
the company will replace it


## 6. Find Most Popular Products and Brands

In [33]:
# Product metadata, indexed by the file's first column. The cells below join
# it on "asin" and read its "title" and "brand" columns.
title = pd.read_csv("title_brand.csv", index_col=0)

In [None]:
# Show full product titles instead of truncating them in the rendered table.
pd.set_option("display.max_colwidth", None)

# Attach the product title to each selected comment. `title[["title"]]` keeps
# only the title column, so the "asin" join key has to come from its index
# (the frame was loaded with index_col = 0).
r1 = pd.merge(
    left = df,
    right = title[["title"]],
    on = "asin",
    how = "left"
)

# Per product: number of selected comments and their mean rating `y`.
r1 = r1.groupby(["asin", "title"])[["y"]].agg(count = ("y", "count"), mean = ("y", "mean"))
r1.columns = ["Number of Votes", "Average Score"]

# Only rank products with more than 100 selected comments.
r1 = r1.loc[r1["Number of Votes"] > 100]

# Rank on the exact mean and round afterwards (same order of operations as
# the brand table): sorting on the rounded value would leave products that
# tie after rounding in an arbitrary order.
r1 = r1.sort_values(by = "Average Score", ascending = False)
r1["Average Score"] = r1["Average Score"].round(2)

# Move the title out of the index into a "Product Name" column; asin stays
# as the index.
r1 = r1.reset_index("title").rename(columns = {"title": "Product Name"})
r1

Unnamed: 0_level_0,Product Name,Number of Votes,Average Score
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B00OBRE5UE,Samsung 850 EVO 500GB 2.5-Inch SATA III Internal SSD (MZ-75E500B/AM),156,4.65
B0043T7FXE,"Logitech M570 Wireless Trackball Mouse &ndash; Ergonomic Design with Sculpted Right-hand Shape, Compatible with Apple Mac and Microsoft Windows Computers, USB Unifying Receiver, Dark Gray",136,4.18
B00DSUTX3O,WD Black 750GB Performance Mobile Hard Disk Drive - 7200 RPM SATA 6 Gb/s 16MB Cache 9.5 MM 2.5 Inch - WD7500BPKX,101,4.17
B0000BVYT3,"NETGEAR 5-Port Gigabit Ethernet Unmanaged Switch, Sturdy Metal, Desktop, Plug-and-Play, ProSAFE Lifetime Protection (GS105NA)",120,3.95
B004OVECU0,"Logitech Harmony 650 Infrared All in One Remote Control, Universal Remote Logitech, Programmable Remote (Silver)",121,3.94
B010OYASRG,"OontZ Angle 3 Enhanced Stereo Edition IPX5 Splashproof Portable Bluetooth Speaker with Volume Booster AMP 10 Watts Power, Custom Bass Radiator, 100' Wireless Range Bluetooth 4.2",133,3.65
B0001FTVEK,Sennheiser RS120 On-Ear Wireless RF Headphones with Charging Dock,126,3.6
B00BUSDVBQ,"TP-Link AC1750 Smart WiFi Router - Dual Band Gigabit Wireless Router, 802.11ac Internet Router, Wireless routers for home(Archer C7)",149,3.6
B01DA0YCNC,Roku Streaming Stick (3600R) - HD Streaming Player with Quad-Core Processor,104,3.49
B00S9SGNNS,"ASUS Tri-Band Gigabit (AC3200) WiFi Router (Up to 3167 Mbps) with MU-MIMO to ensure Lag-Free Gaming, AiProtection network security powered by Trend Micro, Adaptive QoS and Parental Control (RT-AC3200)",172,3.32


In [92]:
# Brand-level summary of the selected comments: join the brand onto each
# comment, count comments and average the rating per brand, order by the
# exact average, keep brands with more than 100 comments, then round.
r2 = (
    pd.merge(left=df, right=title[["brand"]], on="asin", how="left")
    .groupby("brand")[["y"]]
    .agg(count=("y", "count"), mean=("y", "mean"))
    .rename(columns={"count": "Number of Votes", "mean": "Average Score"})
    .sort_values(by="Average Score", ascending=False)
    .loc[lambda table: table["Number of Votes"] > 100]
    .round({"Average Score": 2})
    .rename_axis("Brand Name")
)
r2.head(10)

Unnamed: 0_level_0,Number of Votes,Average Score
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Noctua,350,4.76
The Friendly Swede,156,4.56
It is,151,4.5
MEKO,126,4.5
Syncwire,109,4.44
AMD,102,4.41
Bargains Depot,148,4.41
Crucial,302,4.39
Geekria,115,4.38
Powerextra,105,4.32


In [94]:
# Persist the keyword-related comments, keeping the row index in the file.
df.to_csv("selected_comments.csv", index=True)