In [31]:
import numpy as np
import pandas as pd 
from pandarallel import pandarallel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Train set preprocessing

In [32]:
df_raw = pd.read_json(path_or_buf='/kaggle/input/vc-it-cup-ranking/ranking_train.jsonl', lines=True)

In [33]:
posts = df_raw['text']
comments = df_raw.explode('comments')['comments'].to_frame()
comments['text'] = comments['comments'].apply(lambda x: x['text'])
comments['target'] = comments['comments'].apply(lambda x: x['score'])
comments = comments.drop('comments', axis=1)

In [34]:
import re
import string

def clean_text(text):
    text = text.lower()  
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)    
    text = re.sub("([^\x00-\x7F])+", " ", text)
    return ' '.join(word_tokenize(text))

In [35]:
posts_cleaned = posts.parallel_map(clean_text)
comments_cleaned = comments.copy()
comments_cleaned['text'] = comments['text'].parallel_map(clean_text)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44054), Label(value='0 / 44054')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=220268), Label(value='0 / 220268')…

In [36]:
corpus_stop = set(stopwords.words("english"))

corpus_comments = [word for i in comments_cleaned['text'].str.split().values.tolist() for word in i if (word not in corpus_stop)]

In [38]:
posts_cleaned = posts_cleaned.parallel_map(lambda x: ' '.join([i for i in x.split() if i not in corpus_stop]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44054), Label(value='0 / 44054')))…

In [39]:
from collections import Counter 

comments_counter = Counter(corpus_comments)

In [40]:
most_frequent_words_in_comments = [i[0] for i in comments_counter.most_common(20)]

In [42]:
stop = set(stopwords.words('english'))
stop.update(most_frequent_words_in_comments)

In [43]:
comments_cleaned['clear_text'] = comments_cleaned['text'].parallel_map(lambda x: ' '.join([i for i in x.split() if i not in stop]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=220268), Label(value='0 / 220268')…

In [44]:
from nltk.stem.snowball import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

posts_stemmed = posts_cleaned.parallel_map(lambda x: ' '.join([snow_stemmer.stem(i) for i in x.split()]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44054), Label(value='0 / 44054')))…

In [45]:
comments_stemmed = comments_cleaned['clear_text'].parallel_map(lambda x: ' '.join([snow_stemmer.stem(i) for i in x.split()]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=220268), Label(value='0 / 220268')…

In [46]:
posts_stemmed.repeat(5)

0        mani summer combin funde decid continu startup...
0        mani summer combin funde decid continu startup...
0        mani summer combin funde decid continu startup...
0        mani summer combin funde decid continu startup...
0        mani summer combin funde decid continu startup...
                               ...                        
88106    pay rent credit debit card landlord signup requir
88106    pay rent credit debit card landlord signup requir
88106    pay rent credit debit card landlord signup requir
88106    pay rent credit debit card landlord signup requir
88106    pay rent credit debit card landlord signup requir
Name: text, Length: 440535, dtype: object

In [47]:
preprocessed_dataset = pd.DataFrame(dict(text=posts_stemmed.repeat(5), comments=comments_stemmed, score=comments['target']))

In [48]:
preprocessed_dataset = preprocessed_dataset.groupby(preprocessed_dataset.index).agg({'comments': lambda x: x.to_list(), 'score': lambda x: x.to_list()})

In [49]:
preprocessed_dataset['posts'] = posts_stemmed

In [50]:
preprocessed_dataset.to_json('train_preprocessed.json')

# Test set Preprocessing

In [65]:
df_test = pd.read_json(path_or_buf='/kaggle/input/vc-it-cup-ranking/ranking_test.jsonl', lines=True)

In [66]:
posts_test = df_test['text']
comments_test = df_test.explode('comments')['comments'].to_frame()
comments_test['text'] = comments_test['comments'].apply(lambda x: x['text'])
comments_test = comments_test.drop('comments', axis=1)

In [69]:
posts_test_cleaned = posts_test.parallel_map(clean_text)
comments_test_cleaned = comments_test.copy()
comments_test_cleaned['text'] = comments_test['text'].parallel_map(clean_text)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7002), Label(value='0 / 7002'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=35010), Label(value='0 / 35010')))…

In [70]:
posts_test_cleaned = posts_test_cleaned.parallel_map(lambda x: ' '.join([i for i in x.split() if i not in corpus_stop]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7002), Label(value='0 / 7002'))), …

In [71]:
comments_test_cleaned['clear_text'] = comments_test_cleaned['text'].parallel_map(lambda x: ' '.join([i for i in x.split() if i not in stop]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=35010), Label(value='0 / 35010')))…

In [72]:
posts_test_stemmed = posts_test_cleaned.parallel_map(lambda x: ' '.join([snow_stemmer.stem(i) for i in x.split()]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7002), Label(value='0 / 7002'))), …

In [73]:
comments_test_stemmed = comments_test_cleaned['clear_text'].parallel_map(lambda x: ' '.join([snow_stemmer.stem(i) for i in x.split()]))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=35010), Label(value='0 / 35010')))…

In [74]:
preprocessed_test_dataset = pd.DataFrame(dict(text=posts_test_stemmed.repeat(5), comments=comments_test_stemmed))

In [77]:
preprocessed_test_dataset = preprocessed_test_dataset.groupby(preprocessed_test_dataset.index).agg({'comments': lambda x: x.to_list()})

In [78]:
preprocessed_test_dataset['posts'] = posts_test_stemmed

In [79]:
preprocessed_test_dataset

Unnamed: 0,comments,posts
0,[ix27m still wait stabil wifi ipad sith io 8 q...,io 801 releas broken iphon 6 model withdrawn
1,[employ itx27 better cheaper marketyou allow s...,ask hn us hner get health insur
2,[donx27t understand drug develop public money ...,san diego research crowdfund patentfre cancer ...
3,[ix27m physicist imagin excit news excit possi...,rethink origin univers
4,[someon doesnx27t io develop boggl mind guy bu...,slacktextviewcontrol new grow text input io
...,...,...
13999,[meanwhil us stubb mayor town alaska 18 yearsh...,cat miaow
14000,[radic idea mayb model intellectu properti wro...,facebook piraci problem
14001,[present indepth summari ix27d love read hear ...,go gc solv latenc problem go 15
14002,[ok want quothearquot trippi neural network th...,understand neural network deep visual


In [80]:
preprocessed_test_dataset.to_json('test_preprocessed.json')