In [52]:
import pandas as pd
import numpy as np
import json
import glob
import pandas as pd
import csv

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy
import spacy
from nltk.corpus import stopwords


In [57]:
# Bag of words for each category to classify text.
# Maps a topic label to the keywords that count as a hit for that topic.
# Each keyword is listed once per topic ("cost" used to appear twice under "fee").
topics = {
    "fee": ["fee", "money", "price", "cost", "expensive", "charge"],
    "transaction": ["deposit", "withdrawal", "bank", "delay", "debit", "credit", "card"],
    "trust": ["company", "scam"],
    "service": ["service", "customer", "contact", "support", "email"],
    "app": ["interface", "easy", "friendly", "clean", "user"],
}

In [54]:
# Pre-processing the document
all_files = ["coinbase.csv", "binance.csv", "ftx.csv", "kraken.csv"]

# Read every CSV listed above into its own DataFrame; the list is
# concatenated into a single frame further down.
li = [pd.read_csv(csv_path) for csv_path in all_files]

# Get only rows with >50 chars or about 10 words
def get_rows(row, min_chars=50):
    """Flag whether a review's text is long enough to be analysed.

    Parameters
    ----------
    row : mapping (e.g. the pandas Series for one review)
        Must provide a "content" entry holding the review text.
    min_chars : int, default 50
        The text must be strictly longer than this many characters.

    Returns
    -------
    The same ``row`` object with a boolean "sufficient_len" entry added:
    True when "content" is a string longer than ``min_chars``, else False.
    """
    content = row["content"]
    # A missing CSV cell is not a string (pandas yields a float NaN), and
    # len() would raise TypeError on it, so non-strings count as too short.
    row["sufficient_len"] = isinstance(content, str) and len(content) > min_chars
    return row
    
# Merge the loaded frames, keep only the columns used downstream, and flag
# reviews longer than 50 characters (about 10 words).
frame = (
    pd.concat(li, axis=0, ignore_index=True)
    .loc[:, ["reviewId", "content", "score", "at"]]
    # Vectorised length check instead of a row-by-row apply. A missing
    # review has length NaN, and NaN > 50 is False, so it is filtered out.
    .assign(sufficient_len=lambda reviews: reviews["content"].str.len() > 50)
)
# The boolean column is used directly as the mask; .copy() gives later cells
# an independent frame to add columns to.
frame = frame.loc[frame["sufficient_len"]].copy()
frame


Unnamed: 0,reviewId,content,score,at,sufficient_len
2,40e56de6-c266-446a-89a6-5191a72324e8,Don't like to do update needs to go back to th...,4,2022-07-01 10:07:37,True
3,7bbae22c-e255-478e-aa79-078104b23046,DO NOT RECOMMEND - They have no problem taking...,1,2022-07-01 09:11:51,True
4,c592afe5-b785-49f7-a760-c663f516d303,Was liking it all the way up til the part wher...,1,2022-07-01 08:56:03,True
5,4d1e73a6-5209-4e16-bfd2-008da201c5c5,Not your keys not your crypto. They're also se...,1,2022-07-01 08:48:22,True
8,d1dd5332-fc37-40bd-a499-d23af117acf7,You guys should create another payment mood ap...,3,2022-07-01 07:12:55,True
...,...,...,...,...,...
19877,ffac6965-3de9-4d5b-9c36-5426d925dd12,Slick design and easy to use. Still have to us...,5,2020-11-17 19:11:44,True
19878,5d11a98a-6989-4244-85f2-be72ac295040,"Clean, simple to use app, PLEASE PUT A FIELD W...",5,2020-11-17 18:57:57,True
19879,e6a22ec1-d0ca-455a-b2aa-28b799fc0972,"Unbelievable. This is by far the most easy, us...",5,2020-11-17 18:31:37,True
19880,2ba1ad26-be94-462a-a5df-8d62224abe73,No crypto deposit/withdraw options. Useless. H...,1,2020-11-16 13:30:54,True


In [55]:
# Process documents and insert to df

# The parser and NER components are disabled: the processing code only reads
# token.pos_ and token.lemma_.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Part-of-speech tags of the tokens to keep
allowed_postags = ["NOUN", "ADJ"]
# NLTK's English stop-word list
stop_words = stopwords.words("english")
def insert_processed_doc(row):
    """Add a "content_proc" entry holding the cleaned tokens of row["content"].

    The review text is run through the module-level spaCy pipeline ``nlp``.
    A token is kept when its part-of-speech tag is in ``allowed_postags`` and
    neither its surface form nor its lemma is in ``stop_words``. The kept
    lemmas are joined and passed through gensim's ``simple_preprocess`` with
    ``deacc=True``; the resulting token list is stored on the row.

    Parameters
    ----------
    row : pandas Series (or other mapping)
        Must provide a "content" entry holding the raw review text.

    Returns
    -------
    The same ``row`` with "content_proc" set to a list of tokens.
    """
    doc = nlp(row["content"])  # Create document from the raw text/string

    # Lemmatization. The stop-word test has to look at the word itself: the
    # earlier check compared the POS tag (e.g. "NOUN") with the stop-word
    # list, which never matches, so no stop word was ever removed.
    lemmas = [
        token.lemma_
        for token in doc
        if token.pos_ in allowed_postags
        and token.text.lower() not in stop_words
        and token.lemma_.lower() not in stop_words
    ]

    # Remove marks and make everything lowercase
    row["content_proc"] = gensim.utils.simple_preprocess(" ".join(lemmas), deacc=True)
    return row

# Run the text clean-up on every review, one row at a time, then show the result
frame = frame.apply(insert_processed_doc, axis="columns")

frame


Unnamed: 0,reviewId,content,score,at,sufficient_len,content_proc
2,40e56de6-c266-446a-89a6-5191a72324e8,Don't like to do update needs to go back to th...,4,2022-07-01 10:07:37,True,"[need, old, format, much, time, gainer, loser]"
3,7bbae22c-e255-478e-aa79-078104b23046,DO NOT RECOMMEND - They have no problem taking...,1,2022-07-01 09:11:51,True,"[problem, money, crypto, suspicious, customer,..."
4,c592afe5-b785-49f7-a760-c663f516d303,Was liking it all the way up til the part wher...,1,2022-07-01 08:56:03,True,"[way, part, info, ice, bueno]"
5,4d1e73a6-5209-4e16-bfd2-008da201c5c5,Not your keys not your crypto. They're also se...,1,2022-07-01 08:48:22,True,"[key, crypto, location, datum, enjoy]"
8,d1dd5332-fc37-40bd-a499-d23af117acf7,You guys should create another payment mood ap...,3,2022-07-01 07:12:55,True,"[guy, payment, mood, debit, card]"
...,...,...,...,...,...,...
19877,ffac6965-3de9-4d5b-9c36-5426d925dd12,Slick design and easy to use. Still have to us...,5,2020-11-17 19:11:44,True,"[slick, design, easy, advanced, feature, quick..."
19878,5d11a98a-6989-4244-85f2-be72ac295040,"Clean, simple to use app, PLEASE PUT A FIELD W...",5,2020-11-17 18:57:57,True,"[clean, simple, app, field, price, market, pri..."
19879,e6a22ec1-d0ca-455a-b2aa-28b799fc0972,"Unbelievable. This is by far the most easy, us...",5,2020-11-17 18:31:37,True,"[unbelievable, easy, user, friendly, interface..."
19880,2ba1ad26-be94-462a-a5df-8d62224abe73,No crypto deposit/withdraw options. Useless. H...,1,2020-11-16 13:30:54,True,"[crypto, deposit, option, useless, browser, po..."


In [62]:
# Returns the topic which the processed_doc contains the most words from
def get_topic(topics, processed_doc):
    """Pick the topic whose keyword list matches the most tokens of a document.

    Parameters
    ----------
    topics : dict
        Maps a topic label to a collection of keywords.
    processed_doc : iterable of str
        Tokens of one document; a repeated token counts once per occurrence.

    Returns
    -------
    tuple
        ``(topic, score)`` where ``score`` is the number of matching tokens.
        On a tie the topic that comes first in ``topics`` wins. When nothing
        matches, or ``topics`` is empty, the result is ``("N", 0)``.
    """
    # Materialise once so the tokens can be scanned for every topic even if
    # the caller passed a one-shot iterator.
    tokens = list(processed_doc)
    topic_scores = {
        topic: sum(1 for word in tokens if word in keywords)
        for topic, keywords in topics.items()
    }
    # default=None keeps max() from raising ValueError on an empty topics dict.
    topic_max = max(topic_scores, key=topic_scores.get, default=None)
    if topic_max is None or topic_scores[topic_max] == 0:
        return ("N", 0)
    return (topic_max, topic_scores[topic_max])

In [66]:
def insert_topic(row):
    """Attach the best-matching topic and its score to one review row.

    Calls ``get_topic`` with the module-level ``topics`` dictionary and the
    row's "content_proc" tokens, stores the two results under "topic" and
    "topic_score", and returns the same row.
    """
    row["topic"], row["topic_score"] = get_topic(topics, row["content_proc"])
    return row

# Label every review with its topic and topic score, one row at a time
frame = frame.apply(insert_topic, axis="columns")

In [67]:
# Show only the reviews that matched at least one topic ("N" marks no match)
frame[frame["topic"].ne("N")]

Unnamed: 0,reviewId,content,score,at,sufficient_len,content_proc,topic,topic_score
3,7bbae22c-e255-478e-aa79-078104b23046,DO NOT RECOMMEND - They have no problem taking...,1,2022-07-01 09:11:51,True,"[problem, money, crypto, suspicious, customer,...",fee,2
8,d1dd5332-fc37-40bd-a499-d23af117acf7,You guys should create another payment mood ap...,3,2022-07-01 07:12:55,True,"[guy, payment, mood, debit, card]",transaction,1
10,273ef3ba-4b5a-4ee8-81f3-08fc7adeda89,"Terrible, unethical, useless support team. Jus...",1,2022-07-01 06:37:10,True,"[terrible, unethical, useless, support, team, ...",trust,1
12,9e8114f4-7b1c-480e-b06a-62c613ca6be6,Good bye coinbase - I will not miss your high ...,1,2022-07-01 06:08:08,True,"[coinbase, high, fee, par, chart, decision, ot...",fee,1
14,8bd52842-28da-43dd-8386-382ae169521e,They do not support the privacy of their users...,1,2022-07-01 05:32:47,True,"[privacy, user, developer, application, datum,...",app,1
...,...,...,...,...,...,...,...,...
19877,ffac6965-3de9-4d5b-9c36-5426d925dd12,Slick design and easy to use. Still have to us...,5,2020-11-17 19:11:44,True,"[slick, design, easy, advanced, feature, quick...",app,1
19878,5d11a98a-6989-4244-85f2-be72ac295040,"Clean, simple to use app, PLEASE PUT A FIELD W...",5,2020-11-17 18:57:57,True,"[clean, simple, app, field, price, market, pri...",fee,3
19879,e6a22ec1-d0ca-455a-b2aa-28b799fc0972,"Unbelievable. This is by far the most easy, us...",5,2020-11-17 18:31:37,True,"[unbelievable, easy, user, friendly, interface...",app,4
19880,2ba1ad26-be94-462a-a5df-8d62224abe73,No crypto deposit/withdraw options. Useless. H...,1,2020-11-16 13:30:54,True,"[crypto, deposit, option, useless, browser, po...",transaction,1


In [70]:
# Number of reviews assigned to each topic, most frequent first
frame.loc[:, "topic"].value_counts()

N              5261
fee            1430
service         964
app             691
transaction     406
trust           191
Name: topic, dtype: int64

Unnamed: 0,reviewId,content,score,at,sufficient_len,content_proc,topic,topic_score
8,d1dd5332-fc37-40bd-a499-d23af117acf7,You guys should create another payment mood ap...,3,2022-07-01 07:12:55,True,"[guy, payment, mood, debit, card]",transaction,1
20,61ee021b-4de0-47c8-9718-2bf5a062574c,Fraud app I didn't get deposit please don't us...,1,2022-07-01 03:52:58,True,"[fraud, app, deposit]",transaction,1
36,a38d9306-03dc-4997-a03c-ec7775e7cfe8,Have been unable to add my bank card to my wal...,1,2022-07-01 00:12:41,True,"[unable, bank, card, wallet, week, card, issue...",transaction,1
47,e10da6c0-9f71-4bad-b42b-e6b39959a4fc,Used to love this apps very much. Updated this...,1,2022-06-30 19:52:24,True,"[app, disappointment, earning, mandatory, paym...",transaction,2
99,870e70dd-cf8a-48c4-93b6-334755fd73ae,How to deposit BEP20 USDT in Coinbase Please h...,1,2022-06-29 19:36:17,True,[deposit],transaction,1
...,...,...,...,...,...,...,...,...
19835,073db92c-562c-4094-bf0b-782d4e40e9e4,How do i know that this is official kraken app...,3,2021-01-14 05:33:24,True,"[official, app, app, direct, deposit, credit, ...",transaction,3
19860,18bc2d93-f042-4f64-9b10-3c30072712e8,"No credit card deposits, lie. Not have any inf...",1,2020-11-26 00:41:15,True,"[credit, card, deposit, info, website, bad, us...",transaction,2
19866,714d84a4-2346-4432-8431-a763625212cd,Graphically simple and beautiful; very useful ...,5,2020-11-22 23:26:32,True,"[simple, beautiful, useful, crypto, asset, sma...",transaction,1
19867,69303bed-0a3c-4ab4-a45b-ec78c3d9f33d,Looks good but no credit card funding option. ...,3,2020-11-22 04:52:06,True,"[good, credit, card, funding, option]",transaction,1
