In [77]:
import pandas as pd
import numpy as np
import json
import re
import spacy # NLP Preprocessor
from textblob import TextBlob
spacy.prefer_gpu()
pd.set_option('display.max_colwidth', 1000)

In [56]:
def get_data_from_json(path: str) -> pd.DataFrame:
    with open(path, "r") as json_file:
        json_lines = (line for line in json_file)
        rows = (
            {
                "text": data["text"],
                "comment": comment["text"],
                "score": comment["score"]
            }
            for line in json_lines
            for data in [json.loads(line)]
            for comment in data["comments"]
        )
        return pd.DataFrame(rows, columns=["text", "comment", "score"])



In [57]:
dataset_path = './datasets/ranking_train.jsonl'
df = get_data_from_json(dataset_path)

In [79]:
df.sample(3)

Unnamed: 0,text,comment,score
121161,One Database To Rule The Cloud: Salesforce Debuts Database.com,Database.com is so ugly and unprofessional looking I at first thought it was a spam site and actually had to double check the URL.,1
416555,A Criticism of JavaScript Cryptography,"I thought this was a good post, but I wasn&#x27;t impressed with the criticisms of other blog posts. Okay, perhaps I&#x27;m biased, because I wrote one of them, but how about I try to defend the other?The Matasano post is here:http:&#x2F;&#x2F;matasano.com&#x2F;articles&#x2F;javascript-cryptography&#x2F;Perhaps the most objectionable thing about the Matasano article is title. Otherwise it does a very good job of criticizing a particular way of engineering web cryptography that is, for lack of a better term, total bullshit. But is the approach criticized in the Matasano post used in the real world?Let&#x27;s try an experiment! Go to google.com and type in &quot;encrypted chat&quot;If your results are similar to mine, one of the top 3 results will be &quot;chatcrypt.com&quot;. Let&#x27;s read the &quot;How It Works?&quot; page:&gt; Most people thinks that if a website uses a HTTPS connection (especially with the green address bar) then their &quot;typed-in&quot; informations are tran...",0
83122,Safest Seat on a Plane,"One thing which this article forgot was that also the position of the emergency exit relative to your sitting place is important. If you don't sit in the max. range of 7 rows [1] to it, you are likely to not survive a crash due to people blocking each other, smoke and fire.[1] http://www.news.com.au/travel/travel-advice/how-to-survive-a...",2


### Функци для предобработки текста

In [60]:
def remove_urls(text):
    return re.sub(r'(http|www)\S+', '', text)

def remove_stops(doc):
    """
    Удаляет часто встречающиеся слова
    """
    wo_stops = []
    for token in doc:
        if not token.is_stop:
            wo_stops.append(token.text)

    wo_stops = " ".join(wo_stops)
    return wo_stops


def remove_specials(wo_stops):
    """
    Удаляет cпециальные симловы оставляя только буквы
    """
    clean_text = re.sub('[^A-Za-z0-9]+', ' ', wo_stops)
    clean_text = " ".join(
        [text for text in clean_text.split(' ') if len(text) > 2])
    return clean_text

def extract_feats(doc):
    """
    Преобразует текст к массиву слов (Извекает только существительные и глаголы)
    """
    feats = []
    for token in doc:
        if token.pos_ in ['NOUN','VERB']:
            feats.append(token.text)
    return feats

processed_strings = {}
def preprocess_text(text):
    if text in processed_strings:
        return processed_strings[text]
    
    doc = nlp(remove_urls(text))
    wo_stops = remove_stops(doc)
    clean_text = remove_specials(wo_stops)
    processed_strings[text] = clean_text
    return clean_text

text_sentiments = {}
def get_text_sentiments(text):
    if(text in text_sentiments): return text_sentiments[text]
    return round(TextBlob(text).sentiment.polarity, 5)

## Обработка текста

In [None]:
text_columns = ["text", "comment"]
test_df = df.sample(1000).copy()
test_df[text_columns] = test_df[text_columns].apply(lambda x: x.apply(preprocess_text))
test_df["post_sentiments"] = test_df["text"].apply(lambda x: get_text_sentiments(x))
test_df["comment_sentiments"] = test_df["comment"].apply(lambda x: get_text_sentiments(x))

In [76]:
test_df.sample(100)

Unnamed: 0,text,comment,score,post_sentiments,comment_sentiments
147989,Collection documents startups commonly need Privacy Policy NDA etc,excellent thank putting,4,-0.300,1.00000
173684,tablet dead challenge Apple,Microsoft spending time iPad2 discovered important things has Lightweight Superfast startup Long battery life manufacturers convertible MacBook Air class machine 600 hour battery life MBA start fast iPad think probably use iPad said laid think particularly easy PCs Windows app store thing find spend time email web key apps important frankly think Windows pretty quickly especially given app store probably 100 customers weeks,4,-0.200,0.20417
261851,Ask Going Freelance,recommend having months expenses freelancing difficult initial clients time sustainable 100 000 year Having months expenses saved pretty safe better long term projects jumping help,1,0.000,0.11667
67261,Ask Rate essay thinking,time read blog stop think Great stuck infinite recursion thinking blog thinking blogs,1,0.000,0.80000
434347,Domain dot,Can x27 work phone Safari coming opened server found interesting resolve device connection,2,0.000,0.50000
...,...,...,...,...,...
39133,Security hole found Rails,sad story comparing recent arc security hacking post delighting bug fix added community supporting thankful here rails framework like love bug fix community throwing shit fence other this immediately reminds recent pr0n story rails members community offended reasons community members the good non technical advancement rails period saw documentation project supported community backed rails eminents expecting firm strong support rails come dhh anybody else otherwise like work rails build life afford security hole treated like this ruby way rails close heart defending cases making subjective accept rails edginess mac alike metrosexualism ofense handsomeness awesomeness funky geekness cases like act normally promptly professionaly community EDIT official rails core answer posted 10hrs ago comment contains bugfix instructions disclosure copying Disclosure Notes communication difficulties mis understanding reporter security team vulnerability publicly disclosed websites users advised upd...,3,0.000,0.06722
182972,Ask CMS static brochure ware style small business websites,evangelist ExpressionEngine Open architecture built CodeIgniter Lots add ons free easy tag based template language,2,0.125,0.27778
435990,OMG Markdown,fact charade word quot standard quot seemingly takes control away Gruber rubs wrong way Jeff Atwood praise Markdown They x27 worked hard flesh starting point standardized Markdown implementation think word chose quot Standard quot makes perfect sense Instead John Gruber Marco Arment acting super immature situation Reading Tweets today imagining stomping feet ground didn x27 approve don x27 mean add fuel fire think shouldn x27 validating overly dramatic behavior x27 sure Gruber politely sent Atwood email public tweet asking project reconsider better maybe calling like quot Rockdown quot x27 read far Atwood want Gruber board got crickets instead,0,0.000,0.06399
374538,Dell XPS Developer Edition review,hours isn x27 great x27 shitty quot laptop Haswell,3,0.000,0.80000
