# Goal: vectorize the preprocessed data

In [9]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

In [10]:
# Read the preprocessed articles CSV and preview its first three rows.
preprocessed_csv = "../data/preprocessed/bbc_toi_yahoo_preprocessed_0_8.csv"
data = pd.read_csv(filepath_or_buffer=preprocessed_csv)
data.head(n=3)

Unnamed: 0,heading,content
0,murdering abuse domestic man partner swansea f...,bottles without family ferocious assault never...
1,second covid-19 prevent deadly to india wave f...,health immunity second march narendra covid-19...
2,blow say firm blaze northampton owners cruel,fire swept urged felt who devastating tackle h...


In [11]:
# Every TfidfVectorizer option is written out explicitly so the configuration
# does not depend on library defaults.
tfidf_params = {
    # How raw documents are read and tokenised.
    "input": "content",
    "encoding": "utf-8",
    "decode_error": "strict",
    "strip_accents": None,
    "lowercase": True,
    "preprocessor": None,
    "tokenizer": None,
    "analyzer": "word",
    "stop_words": None,
    "token_pattern": r"(?u)\b\w\w+\b",
    "ngram_range": (1, 1),
    # Vocabulary pruning: float thresholds are document-frequency proportions.
    "max_df": 0.8,
    "min_df": 0.001,
    "max_features": None,
    "vocabulary": None,
    # Weighting and normalisation of the resulting matrix.
    "binary": False,
    "dtype": np.float64,
    "norm": "l2",
    "use_idf": True,
    "smooth_idf": True,
    "sublinear_tf": True,
}
vectorizer = TfidfVectorizer(**tfidf_params)

In [12]:
# Fit an identically configured copy of the vectorizer on the article bodies.
# Re-using one estimator for both columns would discard the content vocabulary
# as soon as the headings are fitted, leaving the content_{i} columns with no
# way to be mapped back to terms. `clone` returns an unfitted estimator with
# the same parameters.
content_vectorizer = clone(vectorizer)
content_vectors_tfidf = content_vectorizer.fit_transform(data.loc[:, "content"])
# `vectorizer` itself ends up fitted on the headings, exactly as before.
heading_vectors_tfidf = vectorizer.fit_transform(data.loc[:, "heading"])

# Wrap each sparse matrix in a sparse-backed frame; columns are numbered by
# their position in the corresponding matrix.
df_vectors_tfidf_c = pd.DataFrame.sparse.from_spmatrix(
    data=content_vectors_tfidf,
    columns=[f"content_{i}" for i in range(content_vectors_tfidf.shape[1])]
)
df_vectors_tfidf_h = pd.DataFrame.sparse.from_spmatrix(
    data=heading_vectors_tfidf,
    columns=[f"heading_{i}" for i in range(heading_vectors_tfidf.shape[1])]
)

# Heading features first, content features second.
df_vectors_tfidf = pd.concat(objs=[df_vectors_tfidf_h, df_vectors_tfidf_c], axis=1)
df_vectors_tfidf.shape

(7900, 9559)

In [20]:
# Write the combined TF-IDF features to CSV (without the index), then preview.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# presumably serialising this wide sparse-backed frame as CSV is very slow --
# confirm whether a sparse-aware format would be acceptable downstream.
tfidf_csv_path = "../data/vectorized/vectorized_tfidf.csv"
df_vectors_tfidf.to_csv(path_or_buf=tfidf_csv_path, index=False)
df_vectors_tfidf.head(n=5)

KeyboardInterrupt: 

## spaCy vectors

In [2]:
import spacy
import en_core_web_lg

In [3]:
# Load the large English model with the listed pipeline components disabled,
# then check the length of a document vector on a throwaway sentence.
# NOTE(review): vectorize() below reads token.lemma_ while 'tagger' is
# disabled -- confirm lemmas are still produced as intended.
disabled_components = ['ner', 'tagger', 'parser']
nlp = en_core_web_lg.load(disable=disabled_components)
sample_doc = nlp("bruh this is nlp")
sample_doc.vector.shape

(300,)

In [6]:
# Count the entries held in the pipeline's `vocab.strings`.
vocab_strings = nlp.vocab.strings
len(vocab_strings)

1476268

In [27]:
def vectorize(document, nlp, vector_size=300):
    """Embed a whitespace-tokenised document as the mean of its word vectors.

    Each token is lemmatised by running it through ``nlp`` on its own. When the
    lemma is found in ``nlp.vocab`` the vector of the lemma is used; otherwise
    the token contributes a zero vector.

    Parameters
    ----------
    document : str
        Text whose tokens are separated by whitespace. Anything that is not a
        string (for example ``None`` or the NaN pandas uses for an empty cell)
        is treated as a document without tokens.
    nlp : callable
        spaCy-style pipeline: ``nlp(text)`` must return an object that can be
        indexed (tokens exposing ``lemma_``) and that has a ``vector``
        attribute; ``nlp.vocab`` must support the ``in`` operator.
    vector_size : int, optional
        Length of the vectors produced by ``nlp``; used to size the zero
        vectors. Defaults to 300.

    Returns
    -------
    numpy.ndarray
        1-D array with the element-wise mean of the token vectors, or
        ``vector_size`` zeros when the document has no tokens.
    """
    if not isinstance(document, str):
        return np.zeros(vector_size)

    token_vectors = []
    # split() with no argument never yields empty strings, unlike split(" ")
    # on repeated/leading/trailing spaces; an empty string has no first token
    # to lemmatise.
    for word in document.split():
        word_lemma_ = nlp(word)[0].lemma_
        if word_lemma_ in nlp.vocab:
            token_vectors.append(nlp(word_lemma_).vector)
        else:
            token_vectors.append(np.zeros(vector_size))

    if not token_vectors:
        # Averaging an empty list would produce NaN instead of a usable vector.
        return np.zeros(vector_size)

    return np.mean(token_vectors, axis=0).reshape(-1)

In [28]:
def _embed_article(article):
    """Vectorise a single article string with the shared `nlp` pipeline."""
    return vectorize(document=article, nlp=nlp)


# Embed the headings first, then the article bodies; `progress_apply` (added
# by `tqdm.pandas()`) shows one progress bar per column.
heading_vectors_spacy = data.loc[:, "heading"].progress_apply(_embed_article)
content_vectors_spacy = data.loc[:, "content"].progress_apply(_embed_article)

100%|██████████| 7900/7900 [00:15<00:00, 510.27it/s]
100%|██████████| 7900/7900 [01:24<00:00, 92.99it/s] 


In [29]:
def _vectors_to_frame(vectors, prefix):
    """Stack a Series of equal-length 1-D vectors into a DataFrame.

    Columns are named ``{prefix}_0`` .. ``{prefix}_{d-1}`` where ``d`` is the
    vector length. ``np.stack`` raises when the vectors differ in length,
    whereas flattening everything and reshaping to a fixed width can silently
    shift values across rows.
    """
    matrix = np.stack(vectors.tolist())
    return pd.DataFrame(
        data=matrix,
        columns=[f"{prefix}_{i}" for i in range(matrix.shape[1])]
    )


df_vectors_spacy_c = _vectors_to_frame(content_vectors_spacy, prefix="content")
df_vectors_spacy_h = _vectors_to_frame(heading_vectors_spacy, prefix="heading")
# Heading features first, content features second.
df_vectors_spacy = pd.concat(objs=[df_vectors_spacy_h, df_vectors_spacy_c], axis=1)

In [30]:
# Write the combined spaCy features to CSV (without the index), report the
# frame's shape, and preview the first rows.
spacy_csv_path = "../data/vectorized/vectorized_spacy.csv"
df_vectors_spacy.to_csv(path_or_buf=spacy_csv_path, index=False)
print(df_vectors_spacy.shape)
df_vectors_spacy.head(n=5)

(7900, 600)


Unnamed: 0,heading_0,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,heading_7,heading_8,heading_9,...,content_290,content_291,content_292,content_293,content_294,content_295,content_296,content_297,content_298,content_299
0,-0.157753,0.089183,0.049427,-0.026076,0.222166,-0.146739,0.12782,-0.115886,0.041303,1.646289,...,-0.016418,0.003355,-0.026659,-0.04338,0.019822,-0.042713,-0.058243,-0.006581,0.007644,0.005938
1,-0.169899,0.097146,0.017788,0.006765,0.05779,-0.159095,-0.052362,0.060337,0.061138,1.183402,...,-0.06208,0.070301,0.003601,-0.020681,0.018249,-0.01993,-0.144257,-0.055062,0.055794,0.024039
2,-0.108841,0.078665,-0.087501,-0.040552,0.090407,-0.217324,0.095325,0.145851,-0.019544,1.495901,...,-0.164935,0.030613,0.058293,0.013569,-0.041393,-0.060055,-0.016733,-0.041237,-0.013777,-0.018004
3,-0.13898,0.200465,-0.255277,-0.018964,-0.256803,0.161597,-0.037887,-0.315532,0.136203,1.00674,...,-0.1563,0.046135,0.015782,-0.029828,0.085402,-0.052874,-0.038955,-0.03277,0.062129,0.101521
4,-0.085742,0.244554,0.057991,-0.149246,0.166486,0.090718,0.070708,0.11344,-0.028118,1.108528,...,-0.110203,0.047423,0.060913,-0.032498,-0.011796,-0.061332,-0.100022,-0.053009,0.038002,0.003618


In [40]:
# Spot-check: cosine similarity between the full feature rows (heading and
# content columns together) labelled 226 and 59.
row_226 = df_vectors_spacy.loc[226].values.reshape(1, -1)
row_59 = df_vectors_spacy.loc[59].values.reshape(1, -1)
cosine_similarity(row_226, row_59)

array([[0.55723556]])

In [38]:
# Show the source rows from label 55 onwards (all columns).
data.loc[55:]

Unnamed: 0,heading,content
55,country profile ecuador,lasso government ethnic native empire next 600...
56,vaccine astrazeneca halts temporarily oxford i...,rising government recent measure vaccinations ...
57,hamble protesters blocked by bp oil terminal e...,barrels place government fuels jobs energy sup...
58,irish authorities cyber government by crime se...,health thursday rté malware department governm...
59,falls cracks crypto china bitcoin currencies,transactions recovered accept generated mining...
...,...,...
7895,axis group max insurer stake sells to long wai...,axis yearly players entities founder adjusted ...
7896,operate court accounts to indian bytedance ind...,department tax government evasion counsels dep...
7897,ajay at roles position aimed of cmd promoters ...,markets norms governance many entities split t...
7898,titles job home call linkedin india of stay ti...,due descriptions member ayenew allow stigma ma...
