# Goal: vectorize the preprocessed data (TF-IDF and spaCy word vectors)

In [19]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
tqdm.pandas()

In [9]:
# Read the preprocessed article corpus and preview its first three rows.
data = pd.read_csv(
    filepath_or_buffer="../data/preprocessed/bbc_toi_yahoo_preprocessed_0_8.csv"
)
data.head(n=3)

Unnamed: 0,heading,content
0,murdering abuse domestic man partner swansea f...,bottles without family ferocious assault never...
1,second covid-19 prevent deadly to india wave f...,health immunity second march narendra covid-19...
2,blow say firm blaze northampton owners cruel,fire swept urged felt who devastating tackle h...


In [10]:
# TF-IDF configuration used for both the heading and the content features.
# Every option is spelled out so the setup does not silently depend on
# library defaults; the keyword arguments are grouped by purpose.
vectorizer = TfidfVectorizer(
    # --- how raw documents are read ---
    input='content',
    encoding='utf-8',
    decode_error='strict',
    # --- normalisation and tokenisation ---
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1),
    # --- vocabulary selection ---
    vocabulary=None,
    max_features=None,
    min_df=0.01,  # float: minimum fraction of documents a term must occur in
    max_df=1.0,
    # --- term weighting ---
    binary=False,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
    norm='l2',
    dtype=np.float64,
)

In [11]:
# Fit TF-IDF separately on the article bodies and on the headings.
#
# The content is fitted with its own vectorizer (same settings as `vectorizer`)
# so that its learned vocabulary survives: refitting a single vectorizer on the
# headings would overwrite the vocabulary learned from the content.
content_vectorizer = TfidfVectorizer(**vectorizer.get_params())
content_vectors_tfidf = content_vectorizer.fit_transform(data.loc[:, "content"])

# `vectorizer` itself still ends up fitted on the headings.
heading_vectors_tfidf = vectorizer.fit_transform(data.loc[:, "heading"])

# Wrap the sparse matrices in sparse-backed frames; columns are numbered
# positionally as `content_<i>` / `heading_<i>`.
df_vectors_tfidf_c = pd.DataFrame.sparse.from_spmatrix(
    data=content_vectors_tfidf,
    columns=[f"content_{i}" for i in range(content_vectors_tfidf.shape[1])]
)
df_vectors_tfidf_h = pd.DataFrame.sparse.from_spmatrix(
    data=heading_vectors_tfidf,
    columns=[f"heading_{i}" for i in range(heading_vectors_tfidf.shape[1])]
)

# Heading features first, then content features, one row per article.
df_vectors_tfidf = pd.concat(objs=[df_vectors_tfidf_h, df_vectors_tfidf_c], axis=1)
assert len(df_vectors_tfidf) == len(data), "expected one feature row per article"

In [12]:
# Persist the combined TF-IDF features to disk, then preview the first rows.
df_vectors_tfidf.to_csv(
    path_or_buf="../data/vectorized/vectorized_tfidf.csv"
)
df_vectors_tfidf.head(n=5)

Unnamed: 0,heading_0,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,heading_7,heading_8,heading_9,...,content_1337,content_1338,content_1339,content_1340,content_1341,content_1342,content_1343,content_1344,content_1345,content_1346
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.078823,0.101704,0.0,0.0,0.155937,0.0
1,0.689996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.099908,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.220102,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.087057,0.0,0.0,0.0,0.172226,0.0


## spaCy vectors

In [14]:
import spacy
import en_core_web_lg

In [28]:
# Load the large English spaCy model with the 'ner', 'tagger' and 'parser'
# pipeline components disabled.
nlp = en_core_web_lg.load(
    disable=['ner', 'tagger', 'parser'],
)

# Sanity check: shape of the document vector for a short sample sentence.
nlp("bruh this is nlp").vector.shape

(300,)

In [144]:
def vectorize(document, nlp, rng=None):
    """Embed a whitespace-separated document as the mean of its word vectors.

    Each word is looked up in ``nlp.vocab``:

    * known words are lemmatised (lemma of the last token ``nlp`` produces for
      the word) and the vector of that lemma is used;
    * unknown words are assigned a random standard-normal vector, which is
      stored in ``nlp.vocab`` via ``set_vector`` so the same word maps to the
      same vector for the rest of the session.

    Parameters
    ----------
    document : str
        Text whose words are separated by whitespace.
    nlp : spaCy language pipeline
        Must expose ``vocab`` (with ``__contains__``, ``set_vector`` and
        ``vectors_length``) and be callable on a string. Its vocab is
        mutated when unknown words are encountered.
    rng : numpy.random.Generator, optional
        Source of the random vectors for unknown words. When omitted, NumPy's
        global random state is used (seed it with ``np.random.seed`` for
        reproducible results).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``nlp.vocab.vectors_length``. A document without
        any words yields an all-zero vector instead of NaN.
    """
    # Use the model's own vector width instead of a hard-coded 300.
    dim = nlp.vocab.vectors_length

    vector_output = []
    # split() without an argument drops the empty strings that split(" ")
    # would produce for repeated, leading or trailing spaces.
    for word in document.split():
        # NOTE(review): membership in the vocab is used as the "known word"
        # test; depending on the spaCy version `nlp.vocab.has_vector(word)`
        # may be the more reliable check -- confirm before changing.
        if word in nlp.vocab:
            lemma = nlp(word)[-1].lemma_
            vector_output.append(nlp(lemma).vector)
        else:
            if rng is None:
                random_vector = np.random.randn(dim)
            else:
                random_vector = rng.standard_normal(dim)
            nlp.vocab.set_vector(word, random_vector)
            vector_output.append(nlp(word).vector)

    if not vector_output:
        # np.mean([]) would give a NaN of the wrong shape; return a neutral
        # vector of the right width so downstream stacking keeps working.
        return np.zeros(dim, dtype=np.float32)

    return np.mean(vector_output, axis=0).reshape(-1)

In [145]:
# `vectorize` draws random vectors for out-of-vocabulary words from NumPy's
# global random state, so fix the seed to make the saved features reproducible
# when the notebook is run from a fresh kernel.
# (Random vectors are stored in `nlp.vocab`, so re-running this cell in the
# same kernel reuses the vectors drawn on the first run.)
np.random.seed(42)

# One averaged word-vector embedding per heading and per article body.
heading_vectors_spacy = data.loc[:, "heading"].progress_apply(
    lambda article: vectorize(document=article, nlp=nlp)
)
content_vectors_spacy = data.loc[:, "content"].progress_apply(
    lambda article: vectorize(document=article, nlp=nlp)
)

100%|██████████| 7900/7900 [00:03<00:00, 2371.29it/s]
100%|██████████| 7900/7900 [00:21<00:00, 359.16it/s]


In [146]:
# Flatten each Series of per-article vectors into a dense (n_articles, 300)
# table, then place the heading features in front of the content features.
df_vectors_spacy_h = pd.DataFrame(
    data=np.concatenate(heading_vectors_spacy.tolist()).reshape(-1, 300),
    columns=[f"heading_{i}" for i in range(300)],
)
df_vectors_spacy = pd.concat(
    objs=[
        df_vectors_spacy_h,
        pd.DataFrame(
            data=np.concatenate(content_vectors_spacy.tolist()).reshape(-1, 300),
            columns=[f"content_{i}" for i in range(300)],
        ),
    ],
    axis=1,
)

In [157]:
# Persist the combined spaCy features, report their shape and preview them.
df_vectors_spacy.to_csv(
    path_or_buf="../data/vectorized/vectorized_spacy.csv"
)
print(df_vectors_spacy.shape)
df_vectors_spacy.head(n=5)

(7900, 600)


Unnamed: 0,heading_0,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,heading_7,heading_8,heading_9,...,content_290,content_291,content_292,content_293,content_294,content_295,content_296,content_297,content_298,content_299
0,-0.395403,-0.141887,-0.492862,0.348586,-0.228022,-0.553083,-0.253479,0.587059,0.080005,0.046023,...,-0.130801,0.223726,0.014547,0.16281,-0.044603,-0.040171,-0.056706,-0.039914,0.066073,-0.090223
1,-0.002046,-0.373139,0.246757,0.089691,0.27458,-0.107132,-0.437309,0.017011,-0.074606,0.394424,...,-0.1129,0.02579,0.079606,0.156572,-0.065178,0.029136,-0.011666,-0.071578,-0.103075,-0.034777
2,0.764498,0.476539,-0.689963,-0.005024,0.334456,-0.208725,-0.11603,-0.259741,0.176904,0.119457,...,-0.010727,-0.093692,-0.034266,0.141945,-0.177817,0.093211,0.09108,0.134218,-0.079569,-0.120034
3,-0.331006,-0.404886,0.281067,0.035025,-0.548491,0.123478,-0.088566,-0.284744,-0.504703,-0.204405,...,-0.245446,0.030049,0.103651,-0.242751,-0.070178,0.042648,0.052237,-0.206619,-0.008419,0.120597
4,0.390195,-0.330829,-0.051395,-0.382031,-0.041006,-0.046889,-0.100032,0.363283,-0.519215,0.699297,...,0.014218,-0.093208,0.039565,-0.051338,0.144061,-0.120748,-0.121784,-0.102259,-0.094109,0.129041


In [161]:
# Spot check: cosine similarity between the spaCy feature rows labelled 13 and 34.
cosine_similarity(
    X=df_vectors_spacy.loc[13].values.reshape(1, -1),
    Y=df_vectors_spacy.loc[34].values.reshape(1, -1),
)

array([[0.11009459]], dtype=float32)

In [165]:
# Spot check: cosine similarity between the TF-IDF feature rows labelled 13 and 1.
# The sparse row values are converted to dense arrays before comparison.
cosine_similarity(
    X=np.array(df_vectors_tfidf.loc[13].values).reshape(1, -1),
    Y=np.array(df_vectors_tfidf.loc[1].values).reshape(1, -1),
)

array([[0.3291562]])