In [1]:
# Silence "RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility".
# It's a benign warning emitted when SciPy was compiled against an older NumPy version, so
# it is safe to ignore.

import warnings

# `message` is a regular expression matched against the start of the warning
# text, so this prefix is enough to catch the full message quoted above. No
# `category` is given, so the filter applies whatever the warning's category.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: ten news headlines. The notes in the following cells refer to
# these documents by their 1-based position, shown next to each entry.
docs = [
    # 1
    (
        "Ukraine demands 15 year sentence for ousted President and former Paul Manafort "
        'client, Viktor Yanukovych, accusing him of "betraying his country" to Russia.'
    ),
    # 2
    "Officials worry Trump may back Erik Prince plan to privatize war in Afghanistan",
    # 3
    "Donald Trump: U.S. 'Not Even Talking' To Canada About NAFTA",
    # 4
    "Russia's first ever gay Pride banned within 24 hours of being announced",
    # 5
    'Trump is a "Russian asset" owned by the mafia, author claims in new book',
    # 6
    (
        "Special counsel Robert Mueller proposes 'well over' 1,000 pieces of evidence for "
        "next trial of ex-Trump campaign boss Paul Manafort: Attorneys"
    ),
    # 7
    (
        "China flat out denies the mass incarceration of Xinjiang’s Uyghurs as testimonies "
        "trickle out"
    ),
    # 8
    "Pentagon worried that U.S. might slide from trade war into actual war with China",
    # 9
    "US says ready to sanction China for buying Iran's oil",
    # 10
    (
        "Traders are heavily betting on a price decline in coffee as orange juice prices "
        "soar – all because of crops, currencies and the US/China trade war"
    ),
]

# TfidfVectorizer can be instantiated with many useful parameters, e.g.:
#   - ngram_range: n-gram features instead of single-word (1-gram) features
#   - lowercase
#   - min_df / max_df: filter features by min/max document frequency
#   - strip_accents, tokenizer
#   - norm: l1 or l2 normalization
#   - smooth_idf: additive smoothing, on by default
#   - analyzer: allows extracting character-level features, which are useful
#     as morphological features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the corpus and vectorize it in one
# step; the result has one row per document and one column per feature.
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

tfidf_matrix.shape

(10, 88)

In [3]:
# Let's compare the last document to every document in the corpus
# (itself included, hence the 1.0 in the last position). Documents
# are numbered from 1 in these notes, so document N is column N-1
# of the result. As expected, the most similar document is 8, which
# shares the words 'China', 'trade', 'war'. 'US' is not considered
# since, once lowercased, it's part of Sklearn's English stopwords
# (and 'U.S.' only yields single-character tokens, which the default
# token pattern discards). The other documents that have some
# similarity are 2 (shares 'war') and 7 and 9, which share 'China'
# with the last document.

from sklearn.metrics.pairwise import cosine_similarity

# Slice with [-1:] instead of a hard-coded [9:10] so this keeps selecting the
# last document if the corpus grows or shrinks. A slice (rather than a plain
# integer index) keeps the selection as a 2-D, single-row matrix.
cosine_similarity(tfidf_matrix[-1:], tfidf_matrix)

array([[0.        , 0.05284215, 0.        , 0.        , 0.        ,
        0.        , 0.04064783, 0.22538064, 0.04653587, 1.        ]])

In [4]:
# NOTE(review): `copy` is not used in any cell visible here — confirm whether
# a later cell needs it before removing this import.
from copy import copy

# Now let's try making a query with a new document. As
# expected, the most similar documents are 1 and 6, which
# both talk about Paul Manafort; document 1 also mentions
# Russia and document 6 also mentions Trump. Documents 2,
# 3 and 5 mention Trump, and document 4 mentions Russia,
# so they also have some similarity. Document 5 would have
# a higher score if 'Russian', through stemming, term
# expansion or some other technique, were related to
# 'Russia'. (Documents are numbered from 1, so document N
# is column N-1 of the result.)

query = "Paul Manafort Trump and Russia"
# Use transform (not fit_transform) on the already-fitted vectorizer so the
# query is encoded with the vocabulary and IDF weights learned from `docs`.
query_vector = tfidf_vectorizer.transform([query])
cosine_similarity(query_vector, tfidf_matrix)

array([[0.34488089, 0.09582984, 0.12859623, 0.17267964, 0.09325757,
        0.30233245, 0.        , 0.        , 0.        , 0.        ]])