In [82]:
import dask.bag as db
import json

folder = 'CL_Cup IT_Data_Scince_секция_кейс_VK_датасет.zip'
test_data = db.read_text(
    'zip://ranking_test.jsonl',
    storage_options={'fo': folder},
    encoding='Windows-1251'
).map(json.loads)
train_data = db.read_text(
    'zip://ranking_train.jsonl',
    storage_options={'fo': folder},
    encoding='Windows-1251'
).map(json.loads)

In [83]:
test_df = test_data.to_dataframe().compute()
train_df = train_data.to_dataframe().compute()

In [3]:
test_df

Unnamed: 0,text,comments
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",[{'text': 'I&#x27;m still waiting for them to ...
1,Ask HN: How do US HNers get their health insur...,[{'text': 'Get it from your employer. It&#x27;...
2,San Diego Researcher Crowdfunding Patent-Free ...,[{'text': 'What I don&#x27;t understand is why...
3,Rethinking the origins of the universe,[{'text': 'I&#x27;m not a physicist. I imagin...
4,SlackTextViewController: A new growing text in...,[{'text': 'As someone that doesn&#x27;t do iOS...
...,...,...
13999,The cat's miaow,"[{'text': 'Meanwhile in the US, Stubbs has bee..."
14000,Facebook’s Piracy Problem,[{'text': 'A radical idea: Maybe our model of ...
14001,Go GC: Solving the Latency Problem in Go 1.5,[{'text': 'Was the presentation more in-depth ...
14002,Understanding Neural Networks Through Deep Vis...,[{'text': 'Ok now I want to &quot;hear&quot; o...


In [4]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
train = train_df.explode(column='comments')
train['comment'] = train.comments.map(lambda dic: dic['text'])
train['score'] = train.comments.map(lambda dic: dic['score'])
train.drop(labels=['comments'], axis=1, inplace=True)
train.head()

Unnamed: 0,text,comment,score
0,How many summer Y Combinator fundees decided n...,Going back to school is not identical with giv...,0
0,How many summer Y Combinator fundees decided n...,There will invariably be those who don't see t...,1
0,How many summer Y Combinator fundees decided n...,For me school is a way to be connected to what...,2
0,How many summer Y Combinator fundees decided n...,I guess it really depends on how hungry you ar...,3
0,How many summer Y Combinator fundees decided n...,I know pollground decided to go back to school...,4


In [6]:
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

class preprocessingWrapper:

  """
    Класс, в рамках которого происходит препроцессинг данных
    для последующих шагов, связанных с EDA и ML. Датафрейм
    оборачивается этим классом и в результате обертки
    возвращается обработанный датасет
  """

  @staticmethod
  def pos_tagger(treebank_tag: str):
    """ Возвращает Part-Of-Speech, относящийся к конкретному значению treebank_tag """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

  def __init__(self):
    self.stemmer = PorterStemmer()
    self.wordnet_lemmatizer = WordNetLemmatizer()
    self.spec_chars = re.compile(r"[$_&+,:;=?@#|'<>.-^*()%!`]*")
    self.unicode_chars = re.compile(
      "["
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
      u"\U00002500-\U00002BEF"  # chinese char
      u"\U00002702-\U000027B0"
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u"\U00010000-\U0010ffff"
      u"\u2640-\u2642" 
      u"\u2600-\u2B55"
      u"\u200d"
      u"\u23cf"
      u"\u23e9"
      u"\u231a"
      u"\ufe0f"  # dingbats
      u"\u3030"
      u"\u2705"
      "]+", 
      re.UNICODE
    )
    self.stop_words = set(stopwords.words('english')).union(
      {
          'think', 'one', 'work', '``', 
          "''", '--', '-', 'like', 'people',
          'would', 'get', 'time', 'make', 'really',
          'dont', 'use', 'good', 'much', 'want', 'could',
          'see', 'want', 'thing', 'need', 'something',
          'way', 'even', 'know', 'also', 'year', 'lot',
          'problem', 'company',  'go', 'many', 'well', 
          'using', 'great', 'look', 'new', 'im', 'thing',
          'google', 'still', 'better', 'idea', 'year', 'day',
          'user', 'article', 'code', 'site', 'seems', 'take',
          'first', 'point', 'itxs', 'say'
      }
    )
    self.garbage = ['http']
    
  def rem_stopwords(self, tokens):
    return filter(
      lambda token: token not in self.stop_words and 
      not any(x in token for x in self.garbage), tokens
    )

  def stem(self, tokens):
    return [self.stemmer.stem(token) for token in tokens]

  def lemma(self, tokens):
    return [
      self.wordnet_lemmatizer.lemmatize(token) if token.isalpha() 
      else token for token in tokens 
    ]

  def __call__(
    self, df: pd.DataFrame, column_for_cleaning: str, op: str
  ):
    """ 
      Этот основной метод позволяет 'обернуть' датафрейм и очистить столбец 
      
      df: pd.DataFrame
        - Датафрейм для очистки
      column_for_cleaning: str
        - Столбец для очистки
      op: str ('stem'/'lemma')
        - Какую операцию в итоге провести над токенами    
    """
    if op not in ['stem', 'lemma']:
      raise Exception(
        'op принимает значения "stem" или "lemma"'
      )

    column = df[column_for_cleaning]
    column = column.str.lower()

    # удаление спец. символов
    column = column.str.replace(self.spec_chars, '', regex=True) 
    column = column.str.replace('[^\d\sA-Za-z]', '')

    # удаление других юникодных символов
    column = column.str.replace(self.unicode_chars, '', regex=True)

    # обработка кастомных паттернов
    column = column.str.replace('hacker news','hn')
    column = column.str.replace('ask hn','askhn')
    column = column.str.replace('ask yc','askyc')
    column = column.str.replace('show hn','showhn')
    column = column.str.replace('hn best','besthn')
    column = column.str.replace('tell hn','tellhn')
    column = column.str.replace('quot','')
    column = column.str.replace('hn review','reviewhn')

    # токенизация
    column = column.apply(word_tokenize)

    # удаление стоп-слов
    column = column.apply(self.rem_stopwords)

    if op == 'stem':
      # стемматизация
      return column.apply(self.stem)
    else:
      # лемматизация
      return column.apply(self.lemma)

In [84]:
test = test_df.explode(column='comments')
test['comment'] = test.comments.map(lambda dic: dic['text'])
test['score'] = test.comments.map(lambda dic: dic['score'])
test.drop(labels=['comments'], axis=1, inplace=True)
test.head()

Unnamed: 0,text,comment,score
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",I&#x27;m still waiting for them to stabilize w...,
0,"iOS 8.0.1 released, broken on iPhone 6 models,...","For those who upgraded, no need to do a restor...",
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",Upgraded shortly after it was released and suf...,
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",I think they were under a lot of pressure on t...,
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",Fix for those who already updated: http:&#x2F...,


In [85]:
preprocesser = preprocessingWrapper()
stemmed_posts_test = preprocesser(test, 'text', 'stem')
stemmed_comments_test = preprocesser(test, 'comment', 'stem')

  column = column.str.replace('[^\d\sA-Za-z]', '')


In [None]:
# формирование столбца соединенного из лемматизированных названия и коммента
# ресет индекса для более удобной индексации на этапе формирования массивов X и Y
test['full_text'] = stemmed_posts_test.str.join(' ') + ' ||| ' + stemmed_comments_test.str.join(' ')
test.reset_index(inplace=True)
test.head(2)

In [8]:
stemmed_posts = pd.read_csv('./stemmed_posts.csv', index_col=0)
stemmed_comments = pd.read_csv('./stemmed_comments.csv', index_col=0)

stemmed_posts['text'] = stemmed_posts['text'].str.replace("'",'') \
.str.rstrip(']').str.lstrip('[').str.replace(' ','').str.split(',')

stemmed_comments['comment'] = stemmed_comments['comment'] \
.str.replace("'",'').str.rstrip(']').str.lstrip('[').str.replace(' ','') \
.str.split(',')

train['full_text'] = stemmed_posts['text'].str.join(' ') + ' ||| ' + stemmed_comments['comment'].str.join(' ')

In [None]:
# формирование семплов из X (fulltext_tfidfs) и Y (scores_for_model)
tfidf_matrix = TfidfVectorizer(decode_error='ignore', analyzer='word').fit_transform(test['full_text'])

In [15]:
train.reset_index(inplace=True)

In [26]:
vect = TfidfVectorizer(decode_error='ignore', analyzer='word')
original_sample = train['full_text'].sample(15000)
tfidf_matrix = vect.fit_transform(
    original_sample
)

In [29]:
def perform_xgb(xy_zipped):
    model = xgb.XGBClassifier(
        n_estimators=1000, max_depth=8, 
        eta=0.5, n_jobs=16,
    )
    model.fit(xy_zipped[0], xy_zipped[1]) 
    y_pred_train = model.predict(xy_zipped[0])
    return y_pred_train, xy_zipped[1], model

vect = TfidfVectorizer(decode_error='ignore', analyzer='word')
original_sample = train['full_text'].sample(10000)
tfidf_matrix = vect.fit_transform(
    original_sample
)
xgb_res = perform_xgb((
    scipy.sparse.csr_matrix(tfidf_matrix), 
    train['score'][original_sample.index]
))

In [30]:
from sklearn.metrics import f1_score
f1_score(train['score'][original_sample.index], xgb_res[0], average='weighted')

0.951797033038659

In [88]:
test.reset_index(inplace=True)
new_matrix = vect.transform(
    test['full_text']
)
predicted_scores = xgb_res[2].predict(new_matrix)
test['score'] = predicted_scores
ungrouped_table = pd.DataFrame(test.groupby('index'))
ungrouped_table[1] = ungrouped_table[1].apply(lambda ser: {
        'text': ser['text'].unique()[0],
        'comments': [
                {'text': comment, 'score': score}
                for comment, score in zip(ser['comment'], ser['score'])
        ]
    } 
)
grouped_table = pd.DataFrame(ungrouped_table[1].tolist())
with open('test_predicted_xgboost.jsonl', "w", encoding='utf-8') as f:
    f.write(grouped_table.to_json(orient='records', lines=True))

In [89]:
grouped_table.head(2)

Unnamed: 0,text,comments
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",[{'text': 'I&#x27;m still waiting for them to ...
1,Ask HN: How do US HNers get their health insur...,[{'text': 'Get it from your employer. It&#x27;...
