In [1]:
import wikipedia

import torch
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel

In [1]:
import pandas as pd
# Let pandas render up to 50 columns before eliding the middle ones.
pd.set_option('display.max_columns', 50)

import numpy as np
import pickle
import matplotlib.pyplot as plt
import time

from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pull in data: load the pickled DataFrame and give it a fresh 0..n-1 index.
# NOTE(security): read_pickle can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
df = (
    pd.read_pickle("../../../data/prd/Paper/FR_meta_and_final_tokens_23DEC21.pkl")
    # Chained (non-inplace) reset keeps the cell a single assignment that is
    # safe to re-run; drop=True discards the stored index instead of keeping
    # it as a column.
    .reset_index(drop=True)
)

# Report (rows, columns) of the loaded frame.
print(df.shape)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [4]:
# Load the tokenizer and the RobertaModel weights from one shared checkpoint
# name so the two can never drift apart.
roberta_checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
model = RobertaModel.from_pretrained(roberta_checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [107]:
# Two single-word inputs, tokenized together as one padded batch of
# PyTorch tensors.
text = ["good", "bad"]
encoded_input = tokenizer(text, return_tensors='pt', padding=True)

# Inference only: run the forward pass without autograd tracking so the
# outputs do not hold on to a computation graph (the previous run attached a
# grad_fn to every output tensor). Later `.detach()` calls remain valid.
with torch.no_grad():
    output = model(**encoded_input)

In [108]:
# Display what the tokenizer returned for the two inputs
# (the `input_ids` and `attention_mask` tensors).
encoded_input

{'input_ids': tensor([[    0,  8396,     2],
        [    0, 10999,     2]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1]])}

In [118]:
# List the named fields available on the model output.
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [119]:
# Display the `last_hidden_state` field of the model output.
output.last_hidden_state

tensor([[[-0.0600,  0.1117,  0.0125,  ..., -0.0877, -0.0479, -0.0382],
         [-0.0543,  0.0645,  0.1325,  ..., -0.2585, -0.1425, -0.0546],
         [-0.0580,  0.1246, -0.0108,  ..., -0.1436, -0.0497, -0.0861]],

        [[-0.0530,  0.0905,  0.0115,  ..., -0.0906, -0.0578, -0.0353],
         [-0.0506, -0.0965,  0.1254,  ..., -0.2834, -0.0933, -0.0792],
         [-0.0466,  0.0914, -0.0126,  ..., -0.1488, -0.0618, -0.0766]]],
       grad_fn=<NativeLayerNormBackward>)

In [109]:
# Shape of the first output field, accessed by name rather than by position.
output.last_hidden_state.shape

torch.Size([2, 3, 768])

In [110]:
# Hidden states for the first input in the batch ("good").
output.last_hidden_state[0]

tensor([[-0.0600,  0.1117,  0.0125,  ..., -0.0877, -0.0479, -0.0382],
        [-0.0543,  0.0645,  0.1325,  ..., -0.2585, -0.1425, -0.0546],
        [-0.0580,  0.1246, -0.0108,  ..., -0.1436, -0.0497, -0.0861]],
       grad_fn=<SelectBackward>)

In [111]:
# Hidden states for the second input in the batch ("bad").
output.last_hidden_state[1]

tensor([[-0.0530,  0.0905,  0.0115,  ..., -0.0906, -0.0578, -0.0353],
        [-0.0506, -0.0965,  0.1254,  ..., -0.2834, -0.0933, -0.0792],
        [-0.0466,  0.0914, -0.0126,  ..., -0.1488, -0.0618, -0.0766]],
       grad_fn=<SelectBackward>)

In [112]:
# Type of the slice that keeps only position 0 of every sequence.
type(output.last_hidden_state[:, 0, :])

torch.Tensor

In [113]:
# doc_vecs contains vectorized documents: one row per input, taken from the
# hidden state at position 0 of each sequence, detached from autograd and
# converted to a NumPy array.
# NOTE(review): position 0 is the same leading token id for every input in
# the encoded batch — confirm this is the intended pooling strategy.
doc_vecs = output.last_hidden_state[:, 0, :].detach().numpy()
print(doc_vecs)

[[-0.05997714  0.1117099   0.01250292 ... -0.08772442 -0.04788758
  -0.03824044]
 [-0.05302423  0.0905467   0.01149731 ... -0.09061782 -0.05778022
  -0.03527931]]


In [114]:
# Check the type of a single document vector (one row of doc_vecs).
type(doc_vecs[0])

numpy.ndarray

In [115]:
# Shape of the first document vector once a leading axis of length 1 is
# added, i.e. viewed as a single-row 2-D array.
doc_vecs[0][None, :].shape

(1, 768)

In [116]:
# Cosine similarity between the "good" and "bad" vectors. Row slices keep
# each operand 2-D with a single row, which is the form cosine_similarity takes.
# NOTE(review): the saved result (~0.9996) for two opposite words suggests the
# position-0 vectors barely separate inputs — consider another pooling
# (e.g. averaging over tokens) before using these as document embeddings.
cosine_similarity(doc_vecs[:1], doc_vecs[1:2])

array([[0.9996189]], dtype=float32)

In [46]:
# Get document representation: the position-0 hidden state of every
# sequence, detached from autograd and converted to a NumPy array.
# NOTE(review): this repeats the `doc_vecs` computation, and the execution
# counts of this cell and the next (46, 45) are out of order relative to the
# cells above; the values saved under the next cell do not match the doc_vecs
# printout, so it was presumably run against an earlier `output`. Re-run the
# notebook top to bottom to confirm.
doc_vector = output.last_hidden_state[:, 0, :].detach().numpy()

print(type(doc_vector))

<class 'numpy.ndarray'>


In [45]:
# Display the full vector for the first document.
# NOTE(review): this prints every component; consider slicing
# (e.g. the first few values) to keep the saved output short.
doc_vector[0]

array([-1.14643313e-01,  1.10333711e-01, -1.48565806e-02, -8.80328715e-02,
        1.13034829e-01, -5.11918180e-02, -3.00938613e-04,  2.94117164e-02,
        2.08435301e-02, -9.86900479e-02, -4.26134691e-02,  2.39708610e-02,
        2.46323347e-02, -6.32747263e-02,  6.43565729e-02, -1.22286044e-02,
       -8.59912559e-02,  1.28469355e-02, -6.85404381e-03, -1.25063034e-02,
       -1.14405125e-01,  1.78088136e-02,  1.03543662e-02,  1.62025899e-01,
       -3.59903574e-02,  8.63939226e-02,  4.45442498e-02,  9.85156000e-02,
       -2.68439967e-02,  4.74899542e-03, -7.75303915e-02, -9.41260830e-02,
        8.12896490e-02,  1.62781458e-02,  1.49947591e-02,  8.35780054e-02,
        1.03193717e-02, -1.94536354e-02, -2.43302137e-02,  4.37730663e-02,
        1.41453734e-02,  1.71921611e-01,  1.89868044e-02,  1.92040962e-03,
        3.52538303e-02,  3.12549099e-02,  1.03037469e-02, -6.07903227e-02,
       -3.01116593e-02, -9.09070484e-03,  1.08087575e-02,  8.74547586e-02,
       -5.55447787e-02,  

In [3]:
# Query Wikipedia for "big data" and print the list of matching page titles.
print(wikipedia.search("big data"))

['Big data', 'Data', 'Big Data (band)', 'Data science', 'Big data ethics', 'List of big data companies', 'Data analysis', 'Data mining', 'Streaming data', 'Data lake']


In [5]:
# Ask Wikipedia for a suggested alternative to the query "Big Data" and print
# it (the saved run printed None).
print(wikipedia.suggest("Big Data"))

None


In [6]:
# Print the plain-text summary of the "Big data" page.
# NOTE(review): the summary is long; consider truncating it before sharing
# the notebook so the output does not bury the surrounding cells.
print(wikipedia.summary("Big data"))

Big data refers to data sets that are too large or complex to be dealt with by traditional data-processing application software. Data with many fields (rows) offer greater statistical power, while data with higher complexity (more attributes or columns) may lead to a higher false discovery rate. Big data analysis challenges include capturing data, data storage, data analysis, search, sharing, transfer, visualization, querying, updating, information privacy, and data source. Big data was originally associated with three key concepts: volume, variety, and velocity. The analysis of big data presents challenges in sampling, and thus previously allowing for only observations and sampling. Thus a fourth concept, veracity, refers to the quality or insightfulness of the data. Without sufficient investment in expertise for big data veracity, then the volume and variety of data can produce costs and risks that exceed an organization's capacity to create and capture value from big data.Current us