In [17]:
import pickle
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations
from collections import Counter
from fuzzywuzzy import fuzz
from utils.get_dates import * 

#from langdetect import detect




In [7]:
from utils.nlp_utils import NLPClass

# Topic folder name; used as the directory prefix for the input pickle and the output parquet.
TOPIC = 'immigrants'

# Single shared NLP helper instance; its methods are applied to DataFrame columns in later cells.
nlp_class = NLPClass()



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Load the {search_terms: [url, ...]} mapping and flatten it to one row per URL.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that this project produced itself.
with open(f'{TOPIC}/temp/article_urls.pkl', 'rb') as f:
    urls = pickle.load(f)

# Collect every (search_terms, url) pair first and build the frame once.
# Appending with df.loc[len(df)] = row grows the frame one row at a time,
# which is quadratic in the number of URLs.
url_pairs = [(key, value) for key, values in urls.items() for value in values]

df = pd.DataFrame({
    'search_terms': [search_terms for search_terms, _ in url_pairs],
    'url': [url for _, url in url_pairs],
})


In [9]:
# Development shortcut: only the first 10 URLs are processed from here on.
df = df.iloc[:10]

In [10]:
# Call nlp_class.get_article_infos once per URL; progress_apply renders a tqdm progress bar.
df['article_infos'] = df['url'].progress_apply(nlp_class.get_article_infos)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.94it/s]


In [13]:
def _field_or_none(article, field):
    """Return ``getattr(article, field)``, or None when ``article`` itself is None."""
    return None if article is None else getattr(article, field)


# Copy the two fields used downstream out of the article objects, then drop the objects.
df['text'] = df['article_infos'].progress_apply(
    lambda article: _field_or_none(article, 'text')
)
df['publish_date'] = df['article_infos'].progress_apply(
    lambda article: _field_or_none(article, 'publish_date')
)
del df['article_infos']

# split_text is exploded into one row per element in the next cell.
df['split_text'] = df['text'].progress_apply(nlp_class.split_text)


100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 3324.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 5029.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 32.69it/s]


In [14]:
# Paragraph-level frame: one row per element of each article's `split_text`.
SAMPLE_ROWS = 50        # development cap on paragraphs; set to None to run the full process
MAX_NER_CHARS = 10000   # only this many leading characters of a paragraph are passed to NER

df_paragraph = (
    df.explode('split_text')
      .drop(columns='text')            # returns a new frame instead of mutating with inplace=True
      # explode() emits NaN for empty lists and keeps None as-is; slicing such a
      # value below (x[:MAX_NER_CHARS]) would raise TypeError, so drop those rows.
      .dropna(subset=['split_text'])
      .reset_index(drop=True)
)

if SAMPLE_ROWS is not None:
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the full paragraph frame.
    df_paragraph = df_paragraph.head(SAMPLE_ROWS).copy()

df_paragraph['entities'] = df_paragraph['split_text'].progress_apply(
    lambda paragraph: nlp_class.return_ner_labels(paragraph[:MAX_NER_CHARS])
)
df_paragraph['sentiment'] = df_paragraph['split_text'].progress_apply(nlp_class.sentiment_scorer)


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.70it/s]


In [15]:

# search_terms is indexed positionally: element 0 becomes `keyword`, element 1 becomes `site`.
df_paragraph['keyword'] = df_paragraph['search_terms'].apply(lambda terms: terms[0])
df_paragraph['site'] = df_paragraph['search_terms'].apply(lambda terms: terms[1])

columns_order = ['keyword', 'site', 'url', 'publish_date', 'sentiment', 'split_text', 'entities']
df_paragraph = df_paragraph[columns_order]

# Split on whether a publish date is already present.
# .copy() on both halves: each gets its publish_date column reassigned below, and
# writing into a bare boolean-mask selection raises SettingWithCopyWarning.
has_publish_date = df_paragraph['publish_date'].notnull()
missing_urls_df = df_paragraph[~has_publish_date].copy()
not_missing = df_paragraph[has_publish_date].copy()
not_missing['publish_date'] = not_missing['publish_date'].apply(lambda x: x.date())

# Look each undated URL up once, however many paragraphs share it.
# A comprehension keeps the loop variables out of the notebook namespace
# (the former `date = ...` loop variable could shadow an imported `date`).
missing_urls_dates_dict = {
    url: get_date_for_url(url) for url in tqdm(missing_urls_df['url'].unique())
}
missing_urls_df['publish_date'] = missing_urls_df['url'].apply(lambda x: missing_urls_dates_dict[x])

# Rows that already had a date come first, back-filled rows after; original index labels are kept.
df = pd.concat([not_missing, missing_urls_df])

# Exact-type comparison: subclasses of `datetime` are left unchanged by this step.
# NOTE(review): `datetime` is not imported explicitly in the import cell — presumably
# it arrives through `from utils.get_dates import *`; confirm it is the class, not the module.
df['publish_date'] = df['publish_date'].apply(lambda x: x.date() if type(x) == datetime else x)
# Empty-string dates are stored as None.
df['publish_date'] = df['publish_date'].apply(lambda x: None if x == '' else x)

# Keep element 0 of every entity whose element 1 equals 'PERSON'.
df['persons_ner'] = df['entities'].apply(
    lambda entity_list: [x[0] for x in entity_list if x[1] == 'PERSON']
)

0it [00:00, ?it/s]


In [18]:

# Only PERSON entities mentioned more than LIMIT times take part in normalisation.
LIMIT = 0

all_person_mentions = [name for names in df['persons_ner'] for name in names]
person_cnt = [x for x in Counter(all_person_mentions).most_common() if x[1] > LIMIT]

# Built once up front; it used to be rebuilt for every single mention.
frequent_names = {name for name, _ in person_cnt}

# sorted() makes the pairing below reproducible: plain set iteration order for
# strings can change from one interpreter run to the next.
persons = sorted({name.lower() for name in all_person_mentions if name in frequent_names})

# Pair up spellings that score a perfect 100 on fuzz.token_set_ratio and record
# them as [shorter, longer]. Ties in length are broken alphabetically so the
# result does not depend on pair order. `persons` holds unique values, so a pair
# never contains two identical names.
duplications = []
duplication_dict = {}
for name_a, name_b in tqdm(list(combinations(persons, 2))):
    if fuzz.token_set_ratio(name_a, name_b) != 100:
        continue
    short_name, long_name = sorted((name_a, name_b), key=lambda name: (len(name), name))
    duplications.append([short_name, long_name])
    # Single-pass grouping instead of rescanning `duplications` for every key.
    duplication_dict.setdefault(short_name, []).append(long_name)

# Each short spelling is replaced by the longest spelling it was paired with
# (the first one encountered when several share the maximum length).
normalization_dict = {
    short_name: max(long_names, key=len)
    for short_name, long_names in duplication_dict.items()
}


def _clean_person_name(name):
    """Strip "'s", drop everything except ASCII letters and whitespace, then title-case."""
    # NOTE(review): "//n" is matched literally (two slashes followed by n). If the
    # intent was to remove newline characters this should probably be "\n" — confirm.
    stripped = str(name).replace("'s", "").replace("//n", "").strip()
    # NOTE(review): `re` is not imported explicitly in the import cell; presumably it
    # is provided by `from utils.get_dates import *`.
    return re.sub(r"[^a-zA-Z\s]", "", stripped).title()


# Lower-case each entity, map it through normalization_dict (falling back to the
# lower-cased entity itself), then clean it.
df['persons_ner_normalized'] = df['persons_ner'].progress_apply(
    lambda entity_list: [
        _clean_person_name(normalization_dict.get(entity.lower(), entity.lower()))
        for entity in entity_list
    ]
)
df['persons_ner_normalized_unique'] = df['persons_ner_normalized'].apply(lambda names: set(names))

100%|████████████████████████████████████████████████████████████████████████████████| 78/78 [00:00<00:00, 7915.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 4578.63it/s]


In [22]:
def _org_names(entity_list):
    """Return element 0 of every entity whose element 1 equals 'ORG', in order."""
    org_names = []
    for entity in entity_list:
        if entity[1] == 'ORG':
            org_names.append(entity[0])
    return org_names


# Per-paragraph organisation mentions: the full list and its de-duplicated set.
df['org_ner'] = df['entities'].apply(_org_names)
df['org_ner_unique'] = df['org_ner'].apply(lambda names: set(names))

In [23]:
# Write the paragraph-level frame to a parquet file inside the topic folder.
output_path = f'{TOPIC}/preprocessed_output.parquet'
df.to_parquet(output_path)