In [138]:
import pandas as pd
import nltk
import gensim
import spacy
import numpy as np
from scipy.spatial.distance import pdist,squareform
import tensorflow as tf
import tensorflow_hub as hub

In [139]:
data_file = './news.csv'
text_col = 'title'

In [140]:
df = pd.read_csv(data_file, header=None, names=['class', 'title', 'text'])

# Raw text

In [141]:
df[text_col][:10]

0                    Fears for T N pension after talks
1    The Race is On: Second Private Team Sets Launc...
2        Ky. Company Wins Grant to Study Peptides (AP)
3        Prediction Unit Helps Forecast Wildfires (AP)
4          Calif. Aims to Limit Farm-Related Smog (AP)
5    Open Letter Against British Copyright Indoctri...
6                         Loosing the War on Terrorism
7    FOAFKey: FOAF, PGP, Key Distribution, and Bloo...
8                     E-mail scam targets police chief
9                    Card fraud unit nets 36,000 cards
Name: title, dtype: object

# 1. Preprocessing

# Сleaning

In [142]:
df['lower'] = df[text_col].str.lower()
df['lower'][:10]

0                    fears for t n pension after talks
1    the race is on: second private team sets launc...
2        ky. company wins grant to study peptides (ap)
3        prediction unit helps forecast wildfires (ap)
4          calif. aims to limit farm-related smog (ap)
5    open letter against british copyright indoctri...
6                         loosing the war on terrorism
7    foafkey: foaf, pgp, key distribution, and bloo...
8                     e-mail scam targets police chief
9                    card fraud unit nets 36,000 cards
Name: lower, dtype: object

In [144]:
df['az'] = df['lower'].str.replace('[^a-z]', ' ') # A-Za-z, а-я, etc...
df['az'][:10]

0                    fears for t n pension after talks
1    the race is on  second private team sets launc...
2        ky  company wins grant to study peptides  ap 
3        prediction unit helps forecast wildfires  ap 
4          calif  aims to limit farm related smog  ap 
5    open letter against british copyright indoctri...
6                         loosing the war on terrorism
7    foafkey  foaf  pgp  key distribution  and bloo...
8                     e mail scam targets police chief
9                    card fraud unit nets        cards
Name: az, dtype: object

In [145]:
df = df.dropna(subset=['az'])

# Tokenization

In [146]:
df['token_1'] = df['az'].str.split(' ')
df['token_1'][:10]

0            [fears, for, t, n, pension, after, talks]
1    [the, race, is, on, , second, private, team, s...
2    [ky, , company, wins, grant, to, study, peptid...
3    [prediction, unit, helps, forecast, wildfires,...
4    [calif, , aims, to, limit, farm, related, smog...
5    [open, letter, against, british, copyright, in...
6                   [loosing, the, war, on, terrorism]
7    [foafkey, , foaf, , pgp, , key, distribution, ...
8              [e, mail, scam, targets, police, chief]
9       [card, fraud, unit, nets, , , , , , , , cards]
Name: token_1, dtype: object

In [147]:
from nltk.tokenize.api import StringTokenizer

df['token_2'] = df['az'].apply(nltk.word_tokenize)
df['token_2'][:10]

0            [fears, for, t, n, pension, after, talks]
1    [the, race, is, on, second, private, team, set...
2    [ky, company, wins, grant, to, study, peptides...
3    [prediction, unit, helps, forecast, wildfires,...
4    [calif, aims, to, limit, farm, related, smog, ap]
5    [open, letter, against, british, copyright, in...
6                   [loosing, the, war, on, terrorism]
7    [foafkey, foaf, pgp, key, distribution, and, b...
8              [e, mail, scam, targets, police, chief]
9                     [card, fraud, unit, nets, cards]
Name: token_2, dtype: object

# Filter too short and too long

In [22]:
len_filter = lambda doc: [token for token in doc if len(token) >= 3 and len(token) <= 15]

In [148]:
df['token_1_len'] = df['token_1'].apply(len_filter)
df['token_1_len'][:10]

0                  [fears, for, pension, after, talks]
1    [the, race, second, private, team, sets, launc...
2              [company, wins, grant, study, peptides]
3       [prediction, unit, helps, forecast, wildfires]
4            [calif, aims, limit, farm, related, smog]
5    [open, letter, against, british, copyright, in...
6                       [loosing, the, war, terrorism]
7    [foafkey, foaf, pgp, key, distribution, and, b...
8                 [mail, scam, targets, police, chief]
9                     [card, fraud, unit, nets, cards]
Name: token_1_len, dtype: object

In [149]:
df['token_2_len'] = df['token_2'].apply(len_filter)
df['token_2_len'][:10]

0                  [fears, for, pension, after, talks]
1    [the, race, second, private, team, sets, launc...
2              [company, wins, grant, study, peptides]
3       [prediction, unit, helps, forecast, wildfires]
4            [calif, aims, limit, farm, related, smog]
5    [open, letter, against, british, copyright, in...
6                       [loosing, the, war, terrorism]
7    [foafkey, foaf, pgp, key, distribution, and, b...
8                 [mail, scam, targets, police, chief]
9                     [card, fraud, unit, nets, cards]
Name: token_2_len, dtype: object

# Filter stopwords

In [25]:
from nltk.corpus import stopwords

In [153]:
stop = stopwords.words('english')
print(stop)
df['no_stop'] = df['token_1_len'].apply(lambda doc: [token for token in doc if token not in stop])
df['no_stop'][:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

0                              [fears, pension, talks]
1    [race, second, private, team, sets, launch, da...
2              [company, wins, grant, study, peptides]
3       [prediction, unit, helps, forecast, wildfires]
4            [calif, aims, limit, farm, related, smog]
5    [open, letter, british, copyright, indoctrinat...
6                            [loosing, war, terrorism]
7    [foafkey, foaf, pgp, key, distribution, bloom,...
8                 [mail, scam, targets, police, chief]
9                     [card, fraud, unit, nets, cards]
Name: no_stop, dtype: object

# Stemming and lemmatization

In [154]:
from nltk.stem.porter import PorterStemmer # simple one
from nltk.stem.snowball import SnowballStemmer # Porter 2
from nltk.stem import WordNetLemmatizer

In [155]:
stemmer = SnowballStemmer('english')
df['stem'] = df['no_stop'].apply(lambda doc: [stemmer.stem(token) for token in doc])
df['stem'][:10]

0                                [fear, pension, talk]
1    [race, second, privat, team, set, launch, date...
2                 [compani, win, grant, studi, peptid]
3             [predict, unit, help, forecast, wildfir]
4               [calif, aim, limit, farm, relat, smog]
5    [open, letter, british, copyright, indoctrin, ...
6                                  [loos, war, terror]
7    [foafkey, foaf, pgp, key, distribut, bloom, fi...
8                   [mail, scam, target, polic, chief]
9                       [card, fraud, unit, net, card]
Name: stem, dtype: object

In [156]:
lemmatizer = WordNetLemmatizer()
df['lem'] = df['no_stop'].apply(lambda doc: [lemmatizer.lemmatize(token) for token in doc])
df['lem'][:10]

0                                [fear, pension, talk]
1    [race, second, private, team, set, launch, dat...
2                [company, win, grant, study, peptide]
3         [prediction, unit, help, forecast, wildfire]
4             [calif, aim, limit, farm, related, smog]
5    [open, letter, british, copyright, indoctrinat...
6                            [loosing, war, terrorism]
7    [foafkey, foaf, pgp, key, distribution, bloom,...
8                  [mail, scam, target, police, chief]
9                       [card, fraud, unit, net, card]
Name: lem, dtype: object

# 2. Vectorization

In [157]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [158]:
df = df[(df['lem'].apply(len) > 0) | (df['stem'].apply(len) > 0)]
df['lem'][:10]

0                                [fear, pension, talk]
1    [race, second, private, team, set, launch, dat...
2                [company, win, grant, study, peptide]
3         [prediction, unit, help, forecast, wildfire]
4             [calif, aim, limit, farm, related, smog]
5    [open, letter, british, copyright, indoctrinat...
6                            [loosing, war, terrorism]
7    [foafkey, foaf, pgp, key, distribution, bloom,...
8                  [mail, scam, target, police, chief]
9                       [card, fraud, unit, net, card]
Name: lem, dtype: object

## Bag of Words (bow)

In [159]:
vectorizer = CountVectorizer()
stem_bow = vectorizer.fit_transform(df['stem'].apply(' '.join))
stem_bow_dimensions = vectorizer.get_feature_names()
print(len(stem_bow_dimensions))

vectorizer = CountVectorizer()
lem_bow = vectorizer.fit_transform(df['lem'].apply(' '.join))
lem_bow_dimensions = vectorizer.get_feature_names()
print(len(lem_bow_dimensions))

7275
8529


In [179]:
df_bow = pd.DataFrame(stem_bow.toarray(), columns=stem_bow_dimensions)

## Term frequency - inverted document frequency (TF-IDF)

In [35]:
vectorizer = TfidfVectorizer()
stem_tfidf = vectorizer.fit_transform(df['stem'].apply(' '.join))
stem_tfidf_dimensions = vectorizer.get_feature_names()
print(len(stem_bow_dimensions))

7275


In [36]:
df_tfidf = pd.DataFrame(stem_tfidf.toarray(), columns=stem_tfidf_dimensions)

## Doc2Vec

In [251]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [264]:
vector_size = 50
steps = 200
window = 4
no_below_filter = 5
random = 42
d2v_model_path = './d2v_model'

In [265]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['stem'])]
model = Doc2Vec(documents, vector_size=vector_size, window=window, min_count=no_below_filter, workers=4, seed=random, steps=steps)

W0820 20:10:07.323155 16444 base_any2vec.py:723] consider setting layer size to a multiple of 4 for greater performance
  "C extension not loaded, training will be slow. "
W0820 20:11:50.853715 16444 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


In [254]:
model.save(d2v_model_path)

In [248]:
model = Doc2Vec.load(d2v_model_path)

## Universal Sentence Encoder

In [47]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")

In [48]:
emb = None
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeddings = embed(list(df[text_col]))
    emb = session.run(embeddings)

# 3. Find simillar texts

In [162]:
def build_similarity_matrix(embeddings, metric='cosine'):
    return squareform(pdist(embeddings, metric=metric))

In [163]:
def get_top_n_ind(similarity_matrix, title_id, n):
    closest = similarity_matrix[title_id,:].argsort()[::-1][-top_n:][::-1]
    return closest

### Calc similarity matrices

In [302]:
sm_use = build_similarity_matrix(emb)

In [64]:
sm_bow = build_similarity_matrix(stem_bow.toarray())

In [65]:
sm_tfidf = build_similarity_matrix(stem_tfidf.toarray())

In [320]:
sm_d2v = build_similarity_matrix(model.docvecs.vectors_docs)

### Similar texts

In [306]:
title_id = 5624
title_id = 1

top_n = 20
print('Document:', df.iloc[title_id][text_col])

Document: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)


In [321]:
closest_use = get_top_n_ind(sm_use, title_id, top_n)
closest_bow = get_top_n_ind(sm_bow, title_id, top_n)
closest_tfidf = get_top_n_ind(sm_tfidf, title_id, top_n)
closest_d2v = get_top_n_ind(sm_d2v, title_id, top_n)

In [322]:
print('BOW')
print(df.iloc[closest_bow][text_col])

BOW
1       The Race is On: Second Private Team Sets Launc...
5455                       Launch Date Set for Solar Sail
5453                           Solar Sail Launch Date Set
2894    Creators of private spaceship announce plans f...
2770             Date with destiny for private rocketeers
1309                                               SI.com
2925                     EU set to launch 'transit camps'
6565                      EasyMobile launch set for March
386                                                SI.com
7559                      Exploring Andromeda (SPACE.com)
2650    Virgin to Launch Commercial Space Flights by 2007
6968           Japan to Resume Space Rocket Launches (AP)
4782               Racing in an Evening Gown (Forbes.com)
3739                    Next space station crew to launch
4925                     Martin wins second straight race
2790            Salesforce.com launches on-demand support
7472             No Safe Place for Satellites (SPACE.com)
749     'D

In [323]:
print('TFIDF')
print(df.iloc[closest_tfidf][text_col])

TFIDF
1       The Race is On: Second Private Team Sets Launc...
2894    Creators of private spaceship announce plans f...
5453                           Solar Sail Launch Date Set
5455                       Launch Date Set for Solar Sail
2770             Date with destiny for private rocketeers
4925                     Martin wins second straight race
1309                                               SI.com
386                                                SI.com
2842    Going Private: The Promise and Danger of Space...
5123         NASA Picks May 2005 Shuttle Launch Date (AP)
6376               PSG lead race for second Group H berth
6968           Japan to Resume Space Rocket Launches (AP)
7559                      Exploring Andromeda (SPACE.com)
749     'Dream Team' Out of Gold Race After Loss to Ar...
3739                    Next space station crew to launch
2925                     EU set to launch 'transit camps'
4307        Trial Date Set for Soldier at Abu Ghraib (AP)
7268    

In [327]:
print('DOC2VEC')
print(df.iloc[closest_d2v][text_col])

DOC2VEC
1       The Race is On: Second Private Team Sets Launc...
2764    .Mac bumps up storage capacity, improves mail ...
2379    BLOG That #39;s the most look-ed up world on M...
1143         AT amp;T Wireless Moves to Sell Canada Asset
4577    Karzai happy to wait for official Afghan poll ...
600           Belarus Bank Denies Money Laundering Charge
1450    Bank sits tight on rates as house price inflat...
2548                     Jet lands in UK after bomb alert
5667    Greek, British Police Break Illegal Software Ring
5586                  Report: EADS Could Link With Thales
7321                  Trade Gap Swells More Than Expected
3311        At Least 37 Killed, 52 Hurt in Pakistan Blast
2039                                    #39;Noles Rebound
834         Schumacher Clinches Seventh Season Title (AP)
7597                               Martinez leaves bitter
6215                    Iran pledges to halt nuclear work
844             Thousands Hit NYC Streets; Cheney Arrives
2876  

In [325]:
print('USE')
print(df.iloc[closest_use][text_col])

USE
1       The Race is On: Second Private Team Sets Launc...
5446             Solar spacecraft set to launch next year
3717    New Crew Prepares for Launch to International ...
2894    Creators of private spaceship announce plans f...
3004    NASA puts off space shuttle flights until at l...
4915                      Nasa to resume shuttle missions
114     Cassini Space Probe Spots Two New Saturn Moons...
5852    NASA delays flight of X-43A scramjet to attemp...
5718                     NASA to test hypersonic scramjet
5551           European spacecraft prepares to orbit Moon
3739                    Next space station crew to launch
3730    LIVE: Launch of Expedition Ten Crew to the ISS...
5611        Europes First Moon Probe to Enter Lunar Orbit
1346                  Space Capsule Heading Back to Earth
5038                           NASA looking at May launch
5123         NASA Picks May 2005 Shuttle Launch Date (AP)
1762    Technical Hitch Delays Russia Space Station La...
4546    Ca