# motive: vectorize the preprocessed data

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
tqdm.pandas()

In [2]:
data = pd.read_csv("../data/preprocessed/bbc_toi_yahoo_preprocessed_0_8.csv")
data.head(3)

Unnamed: 0,heading,content
0,murdering abuse domestic man partner swansea f...,bottles without family ferocious assault never...
1,second covid-19 prevent deadly to india wave f...,health immunity second march narendra covid-19...
2,blow say firm blaze northampton owners cruel,fire swept urged felt who devastating tackle h...


In [3]:
vectorizer = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1),
    max_df=1.,
    min_df=0.01,
    max_features=None,
    vocabulary=None,
    binary=False,
    dtype=np.float64,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True
)

In [4]:
content_vectors_tfidf = vectorizer.fit_transform(data.loc[:, "content"])
heading_vectors_tfidf = vectorizer.fit_transform(data.loc[:, "heading"])
df_vectors_tfidf = pd.DataFrame.sparse.from_spmatrix(
    data=content_vectors_tfidf,
    columns=[f"content_{i}" for i in range(content_vectors_tfidf.shape[1])]
)
df_vectors_tfidf_h = pd.DataFrame.sparse.from_spmatrix(
    data=heading_vectors_tfidf,
    columns=[f"heading_{i}" for i in range(heading_vectors_tfidf.shape[1])]
)

df_vectors_tfidf = pd.concat(objs=[df_vectors_tfidf_h, df_vectors_tfidf], axis=1)

In [5]:
df_vectors_tfidf.to_csv("../data/vectorized/vectorized_tfidf.csv", index=False)
df_vectors_tfidf.head()

Unnamed: 0,heading_0,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,heading_7,heading_8,heading_9,...,content_1337,content_1338,content_1339,content_1340,content_1341,content_1342,content_1343,content_1344,content_1345,content_1346
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.078823,0.101704,0.0,0.0,0.155937,0.0
1,0.689996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.099908,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.220102,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.087057,0.0,0.0,0.0,0.172226,0.0


## spacy vectors

In [6]:
import spacy
import en_core_web_lg

In [7]:
nlp = en_core_web_lg.load(disable = ['ner', 'tagger', 'parser'])
nlp("bruh this is nlp").vector.shape

(300,)

In [8]:
def vectorize(document, nlp):
    vector_output = []
    for word in document.split(" "):
        if word in nlp.vocab:
            nlp_word = nlp(word)
            vector_output.append(nlp(nlp_word[-1].lemma_).vector)
        else:
            nlp.vocab.set_vector(word, np.random.randn(300))
            vector_output.append(nlp(word).vector)
    
    return np.mean(vector_output, axis=0).reshape(-1)

In [9]:
heading_vectors_spacy = data.loc[:, "heading"].progress_apply(
    lambda article: vectorize(document=article, nlp=nlp)
)
content_vectors_spacy = data.loc[:, "content"].progress_apply(
    lambda article: vectorize(document=article, nlp=nlp)
)

100%|██████████| 7900/7900 [00:05<00:00, 1349.96it/s]
100%|██████████| 7900/7900 [00:28<00:00, 273.46it/s]


In [10]:
df_vectors_spacy = pd.DataFrame(
    data=np.concatenate(content_vectors_spacy.values.reshape(-1)).reshape(-1, 300),
    columns=[f"content_{i}" for i in range(300)]
)
df_vectors_spacy_h = pd.DataFrame(
    data=np.concatenate(heading_vectors_spacy.values.reshape(-1)).reshape(-1, 300),
    columns=[f"heading_{i}" for i in range(300)]
)
df_vectors_spacy = pd.concat(objs=[df_vectors_spacy_h, df_vectors_spacy], axis=1)

In [11]:
df_vectors_spacy.to_csv("../data/vectorized/vectorized_spacy.csv", index=False)
print(df_vectors_spacy.shape)
df_vectors_spacy.head()

(7900, 600)


Unnamed: 0,heading_0,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,heading_7,heading_8,heading_9,...,content_290,content_291,content_292,content_293,content_294,content_295,content_296,content_297,content_298,content_299
0,-0.546659,-0.261546,-0.348887,-0.238572,0.458753,-0.179026,0.239807,0.192216,-0.17371,0.316083,...,-0.114826,0.19385,0.022486,0.117654,0.031892,-0.145199,0.073625,0.07731,0.024157,0.009043
1,-0.374014,0.0798,-0.307247,-0.388248,0.658381,-0.388981,0.700401,0.058361,0.230552,-0.509046,...,-0.068768,-0.062376,0.025082,-0.042379,-0.101772,0.114503,0.123833,0.048265,0.000809,-0.040062
2,0.182761,-0.229877,-0.521939,0.445496,0.392754,-0.374589,0.183456,0.574266,0.218962,-0.057935,...,-0.082321,0.121098,0.011565,-0.15794,0.014343,-0.037302,0.032055,0.066054,-0.068794,0.051896
3,-0.148774,0.765963,-0.061879,0.101832,0.19975,0.392823,-0.272694,-0.073629,0.138436,-0.15462,...,-0.152583,-0.24743,0.172976,0.074123,0.046913,-0.140058,0.0929,0.050058,-0.136006,-0.101694
4,-0.236198,-0.339241,-0.252501,-0.046377,-0.60045,0.195566,-0.266664,0.230694,0.595979,-0.16933,...,-0.071191,0.213485,0.115562,0.0336,0.121582,-0.068243,-0.059087,-0.006863,0.098188,-0.085795


In [12]:
cosine_similarity(df_vectors_spacy.loc[13].values.reshape(1, -1), df_vectors_spacy.loc[34].values.reshape(1, -1))

array([[0.11840403]], dtype=float32)

In [13]:
cosine_similarity(np.array(df_vectors_tfidf.loc[13].values).reshape(1, -1),
np.array(df_vectors_tfidf.loc[1].values).reshape(1, -1))

array([[0.3291562]])