## Create Training and Test Set using labelled data

In [27]:
import nltk

In [28]:
import pandas as pd

df1 = pd.read_csv('labels0.csv',index_col=0)
df2 = pd.read_csv('and_or_labels0.csv',index_col=0)

df = pd.concat([df1,df2])
df.head()

Unnamed: 0,id,thread_title,reply_split,classification
0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"Hi stao, the response of the T64fx changes dep...",0
1,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Usually I have a Primorac Carbon and I'm offen...,0
2,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I used T05fx before and I really appreciate th...,1
3,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,On the other hand less spin too.,2
4,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Very good for block.,4


In [29]:
df.iloc[11:15,:]

Unnamed: 0,id,thread_title,reply_split,classification
11,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,I'm also interested.,-2
12,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,maybe TTD can start reviewing some sp rubbers?,-2
13,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,I do not know if tabletennisdaily can reiview ...,-2
14,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,Think they need someone that know how to play ...,-2


In [30]:
df.loc[df.classification==5,'classification'] = -1
df.loc[df.classification==-2,'classification'] = -1 

In [31]:
df.iloc[11:15,:]

Unnamed: 0,id,thread_title,reply_split,classification
11,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,I'm also interested.,-1
12,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,maybe TTD can start reviewing some sp rubbers?,-1
13,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,I do not know if tabletennisdaily can reiview ...,-1
14,5f686bc8b59794ca78c70903,Butterfly Impartial XS vs 802-40 vs spinlord w...,Think they need someone that know how to play ...,-1


In [4]:
df_labelled = df[df.classification != -1]
df_labelled.head()

Unnamed: 0,id,thread_title,reply_split,classification
0,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,"Hi stao, the response of the T64fx changes dep...",0
1,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Usually I have a Primorac Carbon and I'm offen...,0
2,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,I used T05fx before and I really appreciate th...,1
3,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,On the other hand less spin too.,2
4,5f686bc7b59794ca78c70902,Tenergy 05FX vs Tenergy 64FX,Very good for block.,4


In [80]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_labelled.reply_split, df_labelled.classification, test_size=0.2, stratify=df_labelled.classification)

In [81]:
df_train = pd.DataFrame()
df_train['reply_split'] = x_train
df_train['classification'] = y_train

df_test = pd.DataFrame()
df_test['reply_split'] = x_test
df_test['classification'] = y_test

In [83]:
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 1624, 3: 171, 2: 156, 1: 132, 4: 97})
Counter({0: 407, 3: 43, 2: 39, 1: 33, 4: 24})


In [84]:
df_train.to_csv('roberta_training.csv')
df_test.to_csv('roberta_testing.csv')

## Create evaluation set while retaining context and order

In [32]:
df2 = df.copy()
df2 = df2.loc[:,'id':'reply_split']
df2.columns

Index(['id', 'thread_title', 'reply_split'], dtype='object')

In [33]:
df2.to_csv('roberta_eval_1_v2.csv')

## Create eval set 2 using Ooak Forum threads

In [5]:
import os
import yaml

def get_login_info():
    YAML_FILE_PATH = '../../credentials/login.yaml'
    with open(YAML_FILE_PATH) as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
        user_info = yaml.load(file, Loader=yaml.Loader)
    
    return user_info['mongo-db']

In [12]:
from pymongo import MongoClient
# pprint library is used to make the output look more pretty
from pprint import pprint
from random import randint

def connect_mongo(database_name,collection_name):
    
    login_dict = get_login_info()
    ## info
    mongoDbUser=login_dict['user']
    db_name = database_name
    mongoDbPwd=login_dict['password']
    mongo_url = "mongodb+srv://{user_name}:{pwd}@cluster0.gwrcx.gcp.mongodb.net/{dbname}?retryWrites=true&w=majority".format(user_name=mongoDbUser,pwd=mongoDbPwd,dbname=db_name)
    client = MongoClient(mongo_url)
    db=getattr(client,database_name)
    mongo_collections = getattr(db,collection_name)
    
    return mongo_collections

conn = connect_mongo('forums','ooakForum') # connect to mongodb
myquery = {} # sample query
mydoc = conn.find(myquery) # sample result, return only 1

In [13]:
import pandas as pd
import re

def generate_df_from_result(result):
    """
    input: mongodb query result
    output: dataframe
    """
    ids = []
    titles = []
    replies = []

    for idx,val in enumerate(result):
        # add to the lists
        ids.append(val['_id'])
        titles.append(val['title'])
        replies_arr = []

        for reply in val['replies'][1:]: # don't want the original post asking the question so use [1:]
            text = reply['clean_text']#.replace('\n','.') # want to get rid of the \n\n
            replies_arr.append(text)
        replies.append(replies_arr)
    df = pd.DataFrame({'id':ids,'thread_title':titles,'reply':replies})
    df = df.explode('reply') # explode() to go from item in arr to row  
    return df

In [14]:
# filter thread titles containing "vs"
vs = re.compile("vs",re.IGNORECASE)
vs_result = conn.find({"title":vs},{"title":1,"replies.clean_text":1})

df = generate_df_from_result(vs_result)

# remove entries with NaN in 'reply'
mask = [isinstance(item, (str, bytes)) for item in df['reply']]
df = df.loc[mask]  

In [15]:
df.head()

Unnamed: 0,id,thread_title,reply
0,5f6c219629a893d240caed63,chop vs spin rubber -- which characteristics d...,"Going to a club may help...There, seems it's p..."
0,5f6c219629a893d240caed63,chop vs spin rubber -- which characteristics d...,Depends how you play and swing. You can serve ...
0,5f6c219629a893d240caed63,chop vs spin rubber -- which characteristics d...,A rubber consists of two parts: The sponge and...
0,5f6c219629a893d240caed63,chop vs spin rubber -- which characteristics d...,At average club level around me I know several...
0,5f6c219629a893d240caed63,chop vs spin rubber -- which characteristics d...,Also with the 38mm ball. Just more difficult t...


In [19]:
# sentence tokenization
# treats newline character \n as a sentence break
def tokenize_sent(s):
    s_split = s.split('\n') # ensures that \n is taken as a splitter
    s_split = [t for t in s_split if t]
    tokenized = list(map(nltk.sent_tokenize, s_split)) # list of lists
    return [t for sub in tokenized for t in sub] # flatten out

In [21]:
df_s = df.copy()
df_s['reply_split'] = df_s.reply.apply(tokenize_sent)
df_s = df_s.explode('reply_split')
df_s = df_s.drop('reply', axis=1)

In [23]:
ooak_df_vs = df_s

In [24]:
# find all thread title that contains OR or '/' symbol (ignore case)
regex_or = re.compile("( or )|(\/)", re.IGNORECASE)
regex_or_result = conn.find({"title":regex_or},{"title":1,"replies.clean_text":1})
df = generate_df_from_result(regex_or_result)

# remove entries with NaN in 'reply'
mask = [isinstance(item, (str, bytes)) for item in df['reply']]
df = df.loc[mask]  

In [25]:
df.head()

Unnamed: 0,id,thread_title,reply
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,"This is a great idea, it will make it easier f..."
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,"All for it,\nIt would make comparisons much ea..."
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,Excellent haggisv. Some consistency at last in...
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,Debater I think that's up to the reviewer. Any...
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,So we're all happy the way it is.. no changes/...


In [26]:
df_s = df.copy()
df_s['reply_split'] = df_s.reply.apply(tokenize_sent)
df_s = df_s.explode('reply_split')
df_s = df_s.drop('reply', axis=1)

In [27]:
ooak_df_and_or = df_s

In [28]:
ooak_df_and_or

Unnamed: 0,id,thread_title,reply_split
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,"This is a great idea, it will make it easier f..."
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,"Thus, it will be easier to compare different e..."
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,"All for it,"
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,It would make comparisons much easier and also...
0,5f6c218829a893d240caed5e,Inverted Rubber review format / proforma - PLE...,Excellent haggisv.
...,...,...,...
223,5f6c384f220a4650ffffde2b,best serving rubber chinese/european?,Having said all that TSS is a good rubber for ...
223,5f6c384f220a4650ffffde2b,best serving rubber chinese/european?,"In addition you don't need to prime it, it'll ..."
224,5f6c3852220a4650ffffde2d,Kokutaku Tulpe 007 rubber or 868,i have some 007 rubber and i have testet the j...
224,5f6c3852220a4650ffffde2d,Kokutaku Tulpe 007 rubber or 868,Uli


In [29]:
ooak_df_and_or.shape

(9100, 3)

In [30]:
eval_df_2 = pd.concat([ooak_df_vs,ooak_df_and_or])
print(eval_df_2.shape)

(16033, 3)


In [31]:
eval_df_2.to_csv('roberta_eval_2.csv')

## Extract Rubber related threads on myTableTennisnet for eval set 3

In [35]:
conn = connect_mongo('forums','myTt') # connect to mongodb
regex_or = re.compile("([Tt]energy)|([Tt]05)|(MX-P)|(h3)|(FX-P)|(Hurricane 3)|(Evolution)|(ELP)|(ELS)|(Xiom Omega)", re.IGNORECASE)
regex_or_result = conn.find({"title":regex_or},{"title":1,"replies.clean_text":1})

df = generate_df_from_result(regex_or_result)

# remove entries with NaN in 'reply'
mask = [isinstance(item, (str, bytes)) for item in df['reply']]
df = df.loc[mask]  

In [36]:
df.head()

Unnamed: 0,id,thread_title,reply
0,5f7114bbbdf9b0fd5fe51e0e,Tips/Tricks/Help on equipment,\r\n Updated and added some more info...\r\n...
0,5f7114bbbdf9b0fd5fe51e0e,Tips/Tricks/Help on equipment,\nhttp://mytabletennis.net/forum/forum_posts.a...
1,5f711576bb596fe084f2597a,Experiences switching from T05 to Fastarc G1,"\r\n Hello Simon_plays,As for me G1 seems at..."
1,5f711576bb596fe084f2597a,Experiences switching from T05 to Fastarc G1,"\r\n To clarify, you put the Johnson baby oi..."
1,5f711576bb596fe084f2597a,Experiences switching from T05 to Fastarc G1,"\n\nYeah - put that to the topsheet, not to th..."


In [40]:
df_s = df.copy()
df_s['reply_split'] = df_s.reply.apply(tokenize_sent)
df_s = df_s.explode('reply_split')
df_s = df_s.drop('reply', axis=1)

In [41]:
myTt_df = df_s

In [43]:
myTt_df.shape

(90585, 3)

In [42]:
myTt_df.to_csv('roberta_eval_3.csv')