In [108]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
# NLTK's English stop-word list plus a few extra tokens that should also be
# dropped, built in one expression.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from collections import Counter
import math

#### Подготовительная часть для обучения модели

In [1]:
import os

# Location of the Mallet launcher script. Set the MALLET_PATH environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so existing setups keep working unchanged.
mallet_path = os.environ.get(
    'MALLET_PATH',
    'C:\\Users\\Yana\\Desktop\\school\\NLP\\mallet-2.0.8\\bin\\mallet',
)

In [109]:
# Remote JSON file with the newsgroup posts; it is downloaded on every run.
newsgroups_url = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(newsgroups_url)
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [6]:
# The patterns are raw strings so that '\S' and '\s' reach the regex engine as
# written instead of being parsed as (invalid) Python string escapes.
# They are compiled once and reused for every document.
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_document(text):
    """Strip e-mail addresses, collapse whitespace runs and drop single quotes."""
    text = _EMAIL_RE.sub('', text)        # remove e-mail addresses
    text = _WHITESPACE_RE.sub(' ', text)  # newlines / tabs / repeated spaces -> one space
    return text.replace("'", "")          # remove distracting single quotes


# One cleaned string per post, in the same order as the rows of `df`.
data = [_clean_document(doc) for doc in df.content.values.tolist()]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [7]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped; punctuation is dropped by the tokenizer itself).

    Yields:
        One list of tokens per input item, in input order.
    """
    for raw_doc in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens

# Materialize the generator: the token lists are traversed several times below.
data_words = [tokens for tokens in sent_to_words(data)]

In [110]:
# Build the bigram and trigram models.
# Both are gensim Phrases models; the bigram model is fitted on the raw token
# lists with min_count=5, the trigram model passes no min_count.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  # fitted on the bigram-transformed documents

# Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod / trigram_mod are the objects the make_bigrams / make_trigrams
# helpers index with a token list. NOTE(review): make_trigrams is not called in
# the cells visible here, so trigram_mod may be unused.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [12]:
def remove_stopwords(texts):
    """Re-tokenize every document and drop tokens found in ``stop_words``.

    Args:
        texts: iterable of documents. Each document is passed through
            ``str()`` and ``simple_preprocess``, so both raw strings and
            already tokenized lists are accepted.

    Returns:
        A list with one list of kept tokens per document, in input order.
    """
    # Membership in the module-level list is a linear scan per token; a set
    # built once per call gives the same answer in constant time.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Return each document of ``texts`` transformed by the module-level ``bigram_mod``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document of ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Each token list is joined with spaces and run through the module-level
    spaCy pipeline ``nlp``, which must be defined before this function is
    called. Tag names: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: container of ``token.pos_`` values to keep. The
            default is a tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per document, in input order.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
# Filter stop words out of every tokenized document, then merge frequent word
# pairs into single tokens with the bigram phraser.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# lemmatization() reads the module-level `nlp`, so the spaCy model has to be
# loaded before it is called.
nlp = spacy.load("en_core_web_sm")

# Keep only nouns, adjectives, verbs and adverbs, reduced to their lemmas.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [14]:
# Vocabulary built from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# Alias: later cells refer to the lemmatized documents as `texts`.
texts = data_lemmatized

# Bag-of-words representation of every document, produced by the dictionary.
corpus = list(map(id2word.doc2bow, texts))

#### Функция для выбора оптимального количества топиков

In [21]:
def choose_num_topics(mallet_path, corpus, id2word, data_lemmatized,
                      start=1, stop=31, step=2, random_seed=100, coherence='c_v'):
    """Pick the topic count whose Mallet LDA model has the highest coherence.

    One LdaMallet model is trained for every value in
    ``range(start, stop, step)``, so the cost grows with the number of
    candidates. With the default arguments the search is the same as before:
    odd counts from 1 to 29, seed 100, ``c_v`` coherence.

    Args:
        mallet_path: path to the Mallet launcher.
        corpus: bag-of-words corpus used for training.
        id2word: dictionary matching ``corpus``.
        data_lemmatized: tokenized documents handed to ``CoherenceModel``.
        start, stop, step: bounds of the candidate range (``stop`` exclusive).
        random_seed: seed passed to every trained model.
        coherence: coherence measure name passed to ``CoherenceModel``.

    Returns:
        ``(best_num, best_score)``. Both are 0 when no candidate scores
        above 0 (including an empty candidate range).
    """
    best_score = 0
    best_num = 0
    for num_topics in range(start, stop, step):
        ldamallet = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics,
            id2word=id2word, random_seed=random_seed,
        )
        coherence_model = CoherenceModel(
            model=ldamallet, texts=data_lemmatized,
            dictionary=id2word, coherence=coherence,
        )
        score = coherence_model.get_coherence()
        # Strict '>' keeps the smallest topic count when scores tie.
        if score > best_score:
            best_score = score
            best_num = num_topics
    return best_num, best_score

In [22]:
# Run the topic-count search; choose_num_topics trains one Mallet model per candidate count.
best_num, best_score = choose_num_topics(mallet_path, corpus, id2word, data_lemmatized)

In [23]:
best_num, best_score

(27, 0.542053875860002)

#### Лучшее качество получилось при 27 топиках, так что запускаем с этим числом

In [24]:
# Train the final model with the selected topic count, using the same seed (100) as the search.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=best_num, id2word=id2word, random_seed=100)

In [38]:
# Request every topic of the trained model. The count comes from `best_num`
# (the value the model was built with) rather than a literal, so the cell stays
# correct when the search selects a different number of topics.
topics = ldamallet.show_topics(formatted=False, num_topics=best_num)

#### Сделаем словарь с топиками и их словами, чтобы потом найти главный топик

In [39]:
# topic id -> {word: weight}, built from the (topic_id, [(word, weight), ...])
# entries of `topics`.
topic_dict = {
    topic_id: {word: weight for word, weight in word_weights}
    for topic_id, word_weights in topics
}

In [112]:
topic_dict[0].keys()

dict_keys(['time', 'day', 'back', 'hear', 'call', 'long', 'work', 'line', 'week', 'month'])

#### Функция поиска главной темы
Для каждого топика суммирует веса его слов, встретившихся в тексте, и считает главным тот топик, у которого сумма оказалась наибольшей

In [118]:
def find_main_topic(text, topic_dict):
    """Score every topic by summing the weights of its words found in ``text``.

    Args:
        text: iterable of tokens; a repeated token contributes once per
            occurrence.
        topic_dict: mapping ``topic_id -> {word: weight}``.

    Returns:
        ``(top, scores)`` where ``scores`` is a ``Counter`` of summed weights
        per topic and ``top`` is ``scores.most_common(1)``: a one-element list
        ``[(topic_id, score)]``, or ``[]`` when no token matched any topic.
    """
    scores = Counter()
    # Sorted ids give the same 0..n-1 scan order as before for contiguous ids,
    # and also work when some ids are missing (range(len(...)) raised KeyError).
    topic_ids = sorted(topic_dict)
    for word in text:
        for topic_id in topic_ids:
            topic_words = topic_dict[topic_id]
            if word in topic_words:
                scores[topic_id] += topic_words[word]
    return scores.most_common(1), scores

In [119]:
# Dominant topic per document according to the weight-summing heuristic.
# Documents with no matching topic word get the string 'None'.
all_mains = []
for doc_tokens in data_lemmatized:
    top_hit = find_main_topic(doc_tokens, topic_dict)[0]
    all_mains.append(top_hit[0][0] if top_hit else 'None')

In [123]:
all_mains[20:30]

[13, 13, 13, 13, 15, 15, 13, 13, 23, 13]

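Как видим, большая часть текстов получает главный топик 13, что подозрительно. Скорее всего, дело в том, что в `topic_dict` попали только 10 самых весомых слов каждого топика, поэтому сумма учитывает лишь малую часть слов текста. Наверное, вычислять главный топик не так просто, как написано в описании домашки, и простого суммирования недостаточно. Поэтому я воспользуюсь встроенными возможностями Mallet...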

In [141]:
# Apply the trained model to the whole corpus; the next cell reads one element of `t` per document.
t = ldamallet[corpus]

In [146]:
# Dominant topic per document according to Mallet. Each element of `t` is read
# as a sequence of (topic_id, weight) pairs (gensim's LdaMallet output). The
# pair with the largest weight is selected explicitly instead of taking the
# first one, so the result no longer depends on how the pairs are ordered.
# When they already arrive sorted by weight the answer is the same as before.
all_mains_mallet = [
    max(doc_topics, key=lambda pair: pair[1])[0]
    for doc_topics in t
]

In [148]:
all_mains_mallet[20:30]

[23, 17, 4, 16, 15, 12, 3, 13, 25, 1]

Так получше.

#### Сделаем датафрейм, чтобы разделить на группы по топикам и посчитаем тф-идф
Сначала в датафрейме будут не нормальные тексты, а лемматизированные для удобства

In [149]:
# `texts` is the name the TF-IDF cells below read the documents from; it refers to the same lemmatized token lists.
texts = data_lemmatized

In [159]:
# One row per document: its lemmas, Mallet's dominant topic, and the dominant
# topic from the weight-summing heuristic. This rebinds `df`, so the raw frame
# loaded at the top of the notebook is no longer reachable under that name.
df = pd.DataFrame({
    'lemmatized_text': data_lemmatized,
    'main_topic_mallet': all_mains_mallet,
    'main_topic': all_mains,
})
df.head()

Unnamed: 0,lemmatized_text,main_topic_mallet,main_topic
0,"[where, thing, car, nntp_poste, host, park, li...",1,13
1,"[poll, final, call, summary, final, call, cloc...",6,13
2,"[engineering, computer, network, distribution_...",6,13
3,"[division, line, host, write, write, article, ...",23,13
4,"[question, distribution, article, write, clear...",19,13


#### Посчитаем IDF для каждого топика

In [169]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in ``documents``.

    Args:
        documents: sequence of ``{word: count}`` mappings, one per document.
            A word with a count of 0 does not count as present.

    Returns:
        ``{word: log(len(documents) / n_docs_containing_word)}`` for every
        word present in at least one document, in first-seen order.
    """
    doc_freq = Counter()
    for word_counts in documents:
        doc_freq.update(word for word, count in word_counts.items() if count != 0)
    return {
        word: math.log(len(documents) / n_containing)
        for word, n_containing in doc_freq.items()
    }

In [72]:
def get_word_counter(text):
    """Return a ``Counter`` with the number of occurrences of each item of ``text``."""
    return Counter(word for word in text)

In [170]:
# topic id -> IDF table computed only over the documents whose dominant Mallet
# topic is that id.
topic_idfs = {}
for topic_id in range(len(topic_dict)):
    in_topic = df['main_topic_mallet'] == topic_id
    word_counts = [
        get_word_counter(doc_tokens)
        for doc_tokens in df.loc[in_topic, 'lemmatized_text']
    ]
    topic_idfs[topic_id] = computeIDF(word_counts)

#### Посчитаем tf-idf для каждого текста

In [66]:
def computeTF(text_counter, text_list):
    """Return ``{word: count / len(text_list)}`` for every word of ``text_counter``.

    Args:
        text_counter: mapping ``word -> number of occurrences``.
        text_list: the document's tokens; only its length is used.
    """
    return {
        word: count / len(text_list)
        for word, count in text_counter.items()
    }

In [86]:
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply every term frequency by that word's IDF.

    Args:
        tfBagOfWords: mapping ``word -> term frequency``.
        idfs: mapping ``word -> idf``; it must contain every word of
            ``tfBagOfWords``, otherwise ``KeyError`` is raised.

    Returns:
        ``{word: tf * idf}`` in the order of ``tfBagOfWords``.
    """
    return {word: tf * idfs[word] for word, tf in tfBagOfWords.items()}

In [193]:
# For every document: TF over the document itself, IDF taken from the table of
# its dominant Mallet topic, then keep the five highest-scoring words.
all_best_tfidfs = []
# Documents are paired with the topic column directly; the previous
# df.iloc[i][...] lookup built a full row Series for every document.
for doc_tokens, topic_num in zip(texts, df['main_topic_mallet']):
    text_counter = get_word_counter(doc_tokens)
    tfDict = computeTF(text_counter, doc_tokens)

    idf = topic_idfs[topic_num]
    tfidf = computeTFIDF(tfDict, idf)

    best = Counter(tfidf).most_common(5)
    all_best_tfidfs.append([word for word, score in best])

In [197]:
all_best_tfidfs[:10]

[['bricklin', 'tellme', 'lerxst', 'door', 'where'],
 ['poll', 'final', 'clock', 'day', 'acceleration'],
 ['bunch', 'display', 'powerbook', 'hear', 'appearence'],
 ['division', 'chip', 'quadrilateral', 'fill', 'weitek'],
 ['warn', 'error', 'unexpected', 'dumb', 'parity_error'],
 ['weapon', 'needless', 'individual', 'modern', 'term'],
 ['treatment', 'tumor', 'astrocytoma', 'accidentally', 'glad'],
 ['scsi', 'range', 'esdi', 'chip', 'indeed'],
 ['icon', 'win', 'appreciated', 'figure', 'wallpaper'],
 ['board', 'autodoubler', 'diskdoubler', 'licensing', 'icon']]

#### Сделаем табличку

In [200]:
# Final table: text, dominant Mallet topic and the top TF-IDF words per document.
# NOTE(review): the 'lemmatized_text' column is filled from `data`, i.e. the
# cleaned raw posts, not the lemmatized token lists - the column name is
# misleading; confirm nothing depends on it before renaming.
df_result = pd.DataFrame({
    'lemmatized_text': data,
    'main_topic_mallet': all_mains_mallet,
    'best TF-IDF': all_best_tfidfs,
})
df_result.head(15)

Unnamed: 0,lemmatized_text,main_topic_mallet,best TF-IDF
0,From: (wheres my thing) Subject: WHAT car is t...,1,"[bricklin, tellme, lerxst, door, where]"
1,From: (Guy Kuo) Subject: SI Clock Poll - Final...,6,"[poll, final, clock, day, acceleration]"
2,From: (Thomas E Willis) Subject: PB questions....,6,"[bunch, display, powerbook, hear, appearence]"
3,From: (Joe Green) Subject: Re: Weitek P9000 ? ...,23,"[division, chip, quadrilateral, fill, weitek]"
4,From: (Jonathan McDowell) Subject: Re: Shuttle...,19,"[warn, error, unexpected, dumb, parity_error]"
5,From: (Foxvog Douglas) Subject: Re: Rewording ...,26,"[weapon, needless, individual, modern, term]"
6,From: (brian manning delaney) Subject: Brain T...,4,"[treatment, tumor, astrocytoma, accidentally, ..."
7,From: (GRUBB) Subject: Re: IDE vs SCSI Organiz...,6,"[scsi, range, esdi, chip, indeed]"
8,From: Subject: WIn 3.0 ICON HELP PLEASE! Organ...,16,"[icon, win, appreciated, figure, wallpaper]"
9,From: (Stan Kerr) Subject: Re: Sigma Designs D...,6,"[board, autodoubler, diskdoubler, licensing, i..."
