In [2]:
import nltk

from IPython.display import clear_output
import ast
import time

In [3]:
import os
import yaml

def get_login_info():
    YAML_FILE_PATH = '../../credentials/login.yaml'
    with open(YAML_FILE_PATH) as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
        user_info = yaml.load(file, Loader=yaml.Loader)
    
    return user_info['mongo-db']

In [4]:
from pymongo import MongoClient
# pprint library is used to make the output look more pretty
from pprint import pprint
from random import randint

def connect_mongo(database_name,collection_name):
    
    login_dict = get_login_info()
    ## info
    mongoDbUser=login_dict['user']
    db_name = database_name
    mongoDbPwd=login_dict['password']
    mongo_url = "mongodb+srv://{user_name}:{pwd}@cluster0.gwrcx.gcp.mongodb.net/{dbname}?retryWrites=true&w=majority".format(user_name=mongoDbUser,pwd=mongoDbPwd,dbname=db_name)
    client = MongoClient(mongo_url)
    db=getattr(client,database_name)
    mongo_collections = getattr(db,collection_name)
    
    return mongo_collections

conn = connect_mongo('forums','tableTennisDaily') # connect to mongodb
myquery = {} # sample query
mydoc = conn.find(myquery) # sample result, return only 1

In [5]:
import pandas as pd
import re

def generate_df_from_result(result):
    """
    input: mongodb query result
    output: dataframe
    """
    ids = []
    titles = []
    replies = []

    for idx,val in enumerate(result):
        # add to the lists
        ids.append(val['_id'])
        titles.append(val['title'])
        replies_arr = []

        for reply in val['replies'][1:]: # don't want the original post asking the question so use [1:]
            text = reply['clean_text']#.replace('\n','.') # want to get rid of the \n\n
            replies_arr.append(text)
        replies.append(replies_arr)
    df = pd.DataFrame({'id':ids,'thread_title':titles,'reply':replies})
    df = df.explode('reply', ignore_index=True) # explode() to go from item in arr to row  
    return df

In [6]:
# filter thread titles containing "vs"
vs = re.compile("vs",re.IGNORECASE)
vs_result = conn.find({"title":vs},{"title":1,"replies.clean_text":1})

df = generate_df_from_result(vs_result)

# remove entries with NaN in 'reply'
mask = [isinstance(item, (str, bytes)) for item in df['reply']]
df = df.loc[mask]  

In [7]:
df.head()

Unnamed: 0,id,thread_title,reply
0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"\n\nHi stao, the response of the T64fx changes..."
1,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,\n\nI'm also interested.\n\n
2,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,\n\nmaybe TTD can start reviewing some sp rubb...
3,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,\n\nI do not know if tabletennisdaily can reiv...
4,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,\n\nI`ve used to play for several months with ...


In [None]:
# df.thread_title.unique()

In [8]:
# sentence tokenization
# treats newline character \n as a sentence break
def tokenize_sent(s):
    s_split = s.split('\n') # ensures that \n is taken as a splitter
    s_split = [t for t in s_split if t]
    tokenized = list(map(nltk.sent_tokenize, s_split)) # list of lists
    return [t for sub in tokenized for t in sub] # flatten out

In [29]:
df_s = df.copy()
df_s['reply_split'] = df_s.reply.apply(tokenize_sent)
df_s = df_s.explode('reply_split',ignore_index=True)
df_s = df_s.drop('reply', axis=1)

In [30]:
df_s.head()

Unnamed: 0,id,thread_title,reply_split
0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"Hi stao, the response of the T64fx changes dep..."
1,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Usually I have a Primorac Carbon and I'm offen...
2,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I used T05fx before and I really appreciate th...
3,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,On the other hand less spin too.
4,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Very good for block.


In [11]:
#df_s.to_csv('test.csv')

In [32]:
# classification convention: 
# -1: has not been labelled yet
#  0: irrelevant
#  1: comparative
#  2: comparative, but need context (eg previous sentence)
#  3: non-comparative statement for single one
#  4: non-comparative statement for single one, but need context

# initiate new column for labelling the classification
df_s['classification'] = int(-1)
df_s.to_csv('labels0.csv')

In [None]:
##
df_s = pd.read_csv('labels0.csv')
ind0 = (df_s.classification == -1).idxmax()

In [13]:
track_tit = ''
b = -1
i = i0
while 1:
    row_tit = df_s.iloc[i].thread_title
    if row_tit != track_tit:
        print(row_tit)
        b = int(input('have potential?'))
        track_tit = row_tit
    if b:
        print(df_s.iloc[i]['reply_split'])
        time.sleep(0.05)
        a_in = input('classification? ')   
        df_s.at[i,'classification'] = int(a_in)
    i = i + 1

Tenergy 05FX vs Tenergy 64FX
have potential?1
Hi stao, the response of the T64fx changes depending on the the blade you're using.
classification? 0
Usually I have a Primorac Carbon and I'm offensive player with agressive top spin.
classification? 0
I used T05fx before and I really appreciate the general feeling of the T64fx, which is more softer but faster than 05fx.
classification? 1
On the other hand less spin too.
classification? 2
Very good for block.
classification? 4
For my style I really prefer T64fx than T05fx ( always 2.1 for me).
classification? 1
I bought recently the complete Garaydia family to see if from the blade I had a possibility to improve my skills and I would say no.
classification? 0
With the ALC it is too soft and slow for me, with the T5000 too fast and the ZLC has a softer touch and could be an alternative.
classification? 0
I'm truly amazed about the huge difference between those 3 blades.
classification? 0
To finish I'm currently glueing a Dignics 05 on my Pr

In [17]:
df_s.head(15)

Unnamed: 0,id,thread_title,reply_split,classification
0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"Hi stao, the response of the T64fx changes dep...",0
1,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Usually I have a Primorac Carbon and I'm offen...,0
2,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I used T05fx before and I really appreciate th...,1
3,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,On the other hand less spin too.,2
4,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Very good for block.,4
5,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,For my style I really prefer T64fx than T05fx ...,1
6,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I bought recently the complete Garaydia family...,0
7,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"With the ALC it is too soft and slow for me, w...",0
8,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I'm truly amazed about the huge difference bet...,0
9,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,To finish I'm currently glueing a Dignics 05 o...,0


In [21]:
df_s[df_s['classification'] != -1].to_csv('labels.csv')

In [26]:
(df_s.classification == -1).idxmax()

10

In [27]:
df_ss = pd.read_csv('test.csv')

In [28]:
df_ss.head()

Unnamed: 0.1,Unnamed: 0,id,thread_title,reply_split
0,0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"Hi stao, the response of the T64fx changes dep..."
1,1,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Usually I have a Primorac Carbon and I'm offen...
2,2,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I used T05fx before and I really appreciate th...
3,3,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,On the other hand less spin too.
4,4,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Very good for block.
