# This notebook applies TF-IDF to the BR24 data set to create a baseline system

### /data/processed_data_extra_columns.pkl is used for this notebook

### Results are not saved to a .pkl file

In [1]:
import pickle as pk
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the preprocessed data set from a zip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("../data/processed_data_extra_columns.pkl", compression='zip')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(40862, 16)

In [4]:
# Show the first row to inspect the available columns.
df.head(1)

Unnamed: 0,created_at,primary_category,share_url,row_id,tags,title,text,type,clean_text_tfidf,clean_text_tfidf_nltk_stem,clean_text_tfidf_spacy_lemma,clean_text,in_text,out_of_text,in_text_percent,out_of_text_percent
0,2021-08-05 09:44:27.368478+00:00,deutschland-welt,https://www.br.de/nachrichten/deutschland-welt...,SfClRxG,"[Ayatollah Ali Khamenei, Amtseinführung, Iran,...","Amtseinführung im Iran: Neuer Präsident, alte ...",&quot;Hier ist Bayern&quot;: Der BR24 Newslett...,TEXT,Bayern BR Newsletter informiert immer montags ...,Bay br Newslett informiert imm montag freitag ...,Bayer BR Newsletter informieren immer montags ...,"""Hier ist Bayern"": Der BR Newsletter informier...","[Ayatollah Ali Khamenei, Iran, Präsident, Ebra...",[Amtseinführung],80,20


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Single vectorizer instance shared by all TF-IDF helper functions below.
# lowercase=False keeps the original casing of the tokens.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=False,
    smooth_idf=True,
    use_idf=True,
)

In [6]:
import difflib

# 1. TFIDF stemmer -> all predictions

In [7]:
def TFIDF_stemmer_all(row):
    """Rank every term of one article's stemmed text by its TF-IDF weight.

    The module-level ``tfidf_vectorizer`` is re-fitted on this single document,
    so the vocabulary consists only of the terms found in
    ``row.clean_text_tfidf_nltk_stem``.

    NOTE(review): with a one-document corpus the IDF factor is the same for
    every term, so the ranking is effectively by normalised term frequency.

    :param row: DataFrame row exposing ``clean_text_tfidf_nltk_stem``
    :return: numpy array of all vocabulary terms, highest weight first
    """
    tfidf_matrix = tfidf_vectorizer.fit_transform([row.clean_text_tfidf_nltk_stem])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    feature_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names
    features_df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names(), columns=["tfidf"])
    return features_df.sort_values(by=["tfidf"], ascending=False).index.values

In [8]:
# Apply TFIDF_stemmer_all to every row (axis=1) and store the returned ranked
# term array in a new column; progress_apply shows a tqdm progress bar.
df['TFIDF_stemmer_all'] = df.progress_apply(TFIDF_stemmer_all, axis=1)

100%|██████████| 40862/40862 [01:47<00:00, 381.36it/s]


# 2. TFIDF lemmatizer -> all predictions

In [9]:
def TFIDF_lemmatizer_all(row):
    """Rank every term of one article's lemmatized text by its TF-IDF weight.

    The module-level ``tfidf_vectorizer`` is re-fitted on this single document,
    so the vocabulary consists only of the terms found in
    ``row.clean_text_tfidf_spacy_lemma``.

    NOTE(review): with a one-document corpus the IDF factor is the same for
    every term, so the ranking is effectively by normalised term frequency.

    :param row: DataFrame row exposing ``clean_text_tfidf_spacy_lemma``
    :return: numpy array of all vocabulary terms, highest weight first
    """
    tfidf_matrix = tfidf_vectorizer.fit_transform([row.clean_text_tfidf_spacy_lemma])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    feature_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names
    features_df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names(), columns=["tfidf"])
    return features_df.sort_values(by=["tfidf"], ascending=False).index.values

In [10]:
# Apply TFIDF_lemmatizer_all to every row (axis=1) and store the returned ranked
# term array in a new column; progress_apply shows a tqdm progress bar.
df['TFIDF_lemmatizer_all'] = df.progress_apply(TFIDF_lemmatizer_all, axis=1)

100%|██████████| 40862/40862 [01:45<00:00, 388.94it/s]


# 3. TFIDF stemmer -> top 30 predictions

In [11]:
def TFIDF_stemmer_30(row):
    """Return the 30 highest-weighted TF-IDF terms of one article's stemmed text.

    The module-level ``tfidf_vectorizer`` is re-fitted on this single document,
    so the vocabulary consists only of the terms found in
    ``row.clean_text_tfidf_nltk_stem``. Fewer than 30 terms are returned when
    the vocabulary is smaller than that.

    NOTE(review): with a one-document corpus the IDF factor is the same for
    every term, so the ranking is effectively by normalised term frequency.

    :param row: DataFrame row exposing ``clean_text_tfidf_nltk_stem``
    :return: numpy array of at most 30 terms, highest weight first
    """
    tfidf_matrix = tfidf_vectorizer.fit_transform([row.clean_text_tfidf_nltk_stem])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    feature_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names
    features_df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names(), columns=["tfidf"])
    return features_df.sort_values(by=["tfidf"], ascending=False).index.values[:30]

In [12]:
# Apply TFIDF_stemmer_30 to every row (axis=1) and store the top-30 term array
# in a new column.
# NOTE(review): this re-fits the vectorizer for every row; the result looks
# equivalent to the first 30 entries of 'TFIDF_stemmer_all' - confirm and reuse.
df['TFIDF_stemmer_30'] = df.progress_apply(TFIDF_stemmer_30, axis=1)

100%|██████████| 40862/40862 [01:45<00:00, 387.69it/s]


# 4. TFIDF lemmatizer -> top 30 predictions

In [13]:
def TFIDF_lemmatizer_30(row):
    """Return the 30 highest-weighted TF-IDF terms of one article's lemmatized text.

    The module-level ``tfidf_vectorizer`` is re-fitted on this single document,
    so the vocabulary consists only of the terms found in
    ``row.clean_text_tfidf_spacy_lemma``. Fewer than 30 terms are returned when
    the vocabulary is smaller than that.

    NOTE(review): with a one-document corpus the IDF factor is the same for
    every term, so the ranking is effectively by normalised term frequency.

    :param row: DataFrame row exposing ``clean_text_tfidf_spacy_lemma``
    :return: numpy array of at most 30 terms, highest weight first
    """
    tfidf_matrix = tfidf_vectorizer.fit_transform([row.clean_text_tfidf_spacy_lemma])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    feature_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names
    features_df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names(), columns=["tfidf"])
    return features_df.sort_values(by=["tfidf"], ascending=False).index.values[:30]

In [14]:
# Apply TFIDF_lemmatizer_30 to every row (axis=1) and store the top-30 term
# array in a new column.
# NOTE(review): this re-fits the vectorizer for every row; the result looks
# equivalent to the first 30 entries of 'TFIDF_lemmatizer_all' - confirm and reuse.
df['TFIDF_lemmatizer_30'] = df.progress_apply(TFIDF_lemmatizer_30, axis=1)

100%|██████████| 40862/40862 [01:45<00:00, 388.59it/s]


# Finding matches

In [16]:
def find_matches(truth, predictions):
    """
    Collect, for each ground-truth tag, the system prediction that matches it.

    Matching is attempted in two stages per tag:
    1. Fuzzy match: the single best candidate returned by
       difflib.get_close_matches with a similarity cutoff of 0.7.
    2. Substring match: if stage 1 finds nothing, the first prediction that
       contains the tag as a (case-sensitive) substring.

    Tags for which neither stage succeeds contribute nothing to the result.
    The same prediction may appear more than once when several tags match it.

    :param truth: iterable of ground-truth tags (e.g. editor tags)
    :param predictions: sequence of terms predicted by the system
    :return: list with one matched prediction per successfully matched tag,
             in the order of ``truth``
    """
    hits = []
    for editor_tag in truth:
        # Stage 1: best fuzzy candidate, if any reaches the cutoff.
        candidates = difflib.get_close_matches(editor_tag, predictions, n=1, cutoff=0.7)
        if len(candidates) == 0:
            # Stage 2: predictions that contain the tag verbatim.
            candidates = [term for term in predictions if editor_tag in term]
        if len(candidates) == 0:
            continue
        hits.append(candidates[0])
    return hits

## Finding matches between all TF-IDF predictions and editor tags

In [17]:
# For every article, keep the predictions (full ranked term list) that match an editor tag.
df['TFIDF_stemmer_all_x_tags'] = df.progress_apply(
    lambda article: find_matches(article['tags'], article['TFIDF_stemmer_all']),
    axis=1,
)
df['TFIDF_lemmatizer_all_x_tags'] = df.progress_apply(
    lambda article: find_matches(article['tags'], article['TFIDF_lemmatizer_all']),
    axis=1,
)

100%|██████████| 40862/40862 [02:05<00:00, 324.52it/s]
100%|██████████| 40862/40862 [02:19<00:00, 293.22it/s]


In [21]:
# Per-article percentage of editor tags that found a matching prediction.
# Articles without any tags score 0.
# NOTE(review): int() truncates each row's percentage before averaging, so the
# reported mean is biased slightly low.
TFIDF_stemmer_all_x_tags_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_stemmer_all_x_tags) / len(article.tags)) * 100)
        if len(article.tags) != 0
        else 0
    ),
    axis=1,
)
TFIDF_lemmatizer_all_x_tags_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_lemmatizer_all_x_tags) / len(article.tags)) * 100)
        if len(article.tags) != 0
        else 0
    ),
    axis=1,
)

In [22]:
# Mean per-article percentage: stemmer first, then lemmatizer.
for acc_series in (TFIDF_stemmer_all_x_tags_acc, TFIDF_lemmatizer_all_x_tags_acc):
    print(acc_series.mean())

55.79692623953796
66.98316284078116


## Finding matches between all TF-IDF predictions and in_text tags

In [18]:
# For every article, keep the predictions (full ranked term list) that match an in_text tag.
df['TFIDF_stemmer_all_x_in_text'] = df.progress_apply(
    lambda article: find_matches(article['in_text'], article['TFIDF_stemmer_all']),
    axis=1,
)
df['TFIDF_lemmatizer_all_x_in_text'] = df.progress_apply(
    lambda article: find_matches(article['in_text'], article['TFIDF_lemmatizer_all']),
    axis=1,
)

100%|██████████| 40862/40862 [01:22<00:00, 492.89it/s]
100%|██████████| 40862/40862 [01:31<00:00, 447.48it/s]


In [19]:
# Per-article percentage of in_text tags that found a matching prediction.
# Articles with an empty in_text list get the negative sentinel -100 so they
# can be filtered out (value >= 0) before averaging.
# NOTE(review): int() truncates each row's percentage before averaging, so the
# reported mean is biased slightly low.
TFIDF_stemmer_all_x_in_text_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_stemmer_all_x_in_text) / len(article.in_text)) * 100)
        if len(article.in_text) != 0
        else -100
    ),
    axis=1,
)
TFIDF_lemmatizer_all_x_in_text_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_lemmatizer_all_x_in_text) / len(article.in_text)) * 100)
        if len(article.in_text) != 0
        else -100
    ),
    axis=1,
)

In [20]:
# Mean per-article percentage, ignoring rows flagged with a negative sentinel
# (articles without in_text tags): stemmer first, then lemmatizer.
for acc_series in (TFIDF_stemmer_all_x_in_text_acc, TFIDF_lemmatizer_all_x_in_text_acc):
    print(acc_series[acc_series >= 0].mean())

76.7309516750667
90.401571301512


## Finding matches between top 30 TF-IDF predictions and editor tags

In [23]:
# For every article, keep the top-30 predictions that match an editor tag.
df['TFIDF_stemmer_30_x_tags'] = df.progress_apply(
    lambda article: find_matches(article['tags'], article['TFIDF_stemmer_30']),
    axis=1,
)
df['TFIDF_lemmatizer_30_x_tags'] = df.progress_apply(
    lambda article: find_matches(article['tags'], article['TFIDF_lemmatizer_30']),
    axis=1,
)

100%|██████████| 40862/40862 [00:25<00:00, 1610.81it/s]
100%|██████████| 40862/40862 [00:27<00:00, 1469.36it/s]


In [24]:
# Per-article percentage of editor tags matched by one of the top-30 predictions.
# Articles without any tags score 0.
# NOTE(review): int() truncates each row's percentage before averaging, so the
# reported mean is biased slightly low.
TFIDF_stemmer_30_x_tags_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_stemmer_30_x_tags) / len(article.tags)) * 100)
        if len(article.tags) != 0
        else 0
    ),
    axis=1,
)
TFIDF_lemmatizer_30_x_tags_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_lemmatizer_30_x_tags) / len(article.tags)) * 100)
        if len(article.tags) != 0
        else 0
    ),
    axis=1,
)

In [25]:
# Mean per-article percentage: stemmer first, then lemmatizer.
for acc_series in (TFIDF_stemmer_30_x_tags_acc, TFIDF_lemmatizer_30_x_tags_acc):
    print(acc_series.mean())

31.29151779159121
35.13039009348539


## Finding matches between top 30 TF-IDF predictions and in_text tags

In [26]:
# For every article, keep the top-30 predictions that match an in_text tag.
df['TFIDF_stemmer_30_x_in_text'] = df.progress_apply(
    lambda article: find_matches(article['in_text'], article['TFIDF_stemmer_30']),
    axis=1,
)
df['TFIDF_lemmatizer_30_x_in_text'] = df.progress_apply(
    lambda article: find_matches(article['in_text'], article['TFIDF_lemmatizer_30']),
    axis=1,
)

100%|██████████| 40862/40862 [00:17<00:00, 2354.98it/s]
100%|██████████| 40862/40862 [00:18<00:00, 2182.84it/s]


In [29]:
# Per-article percentage of in_text tags matched by one of the top-30 predictions.
# Articles with an empty in_text list get the negative sentinel -100 so they
# can be filtered out (value >= 0) before averaging.
# NOTE(review): int() truncates each row's percentage before averaging, so the
# reported mean is biased slightly low.
TFIDF_stemmer_30_x_in_text_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_stemmer_30_x_in_text) / len(article.in_text)) * 100)
        if len(article.in_text) != 0
        else -100
    ),
    axis=1,
)
TFIDF_lemmatizer_30_x_in_text_acc = df.apply(
    lambda article: (
        int((len(article.TFIDF_lemmatizer_30_x_in_text) / len(article.in_text)) * 100)
        if len(article.in_text) != 0
        else -100
    ),
    axis=1,
)

In [30]:
# Mean per-article percentage, ignoring rows flagged with a negative sentinel
# (articles without in_text tags): stemmer first, then lemmatizer.
for acc_series in (TFIDF_stemmer_30_x_in_text_acc, TFIDF_lemmatizer_30_x_in_text_acc):
    print(acc_series[acc_series >= 0].mean())

45.15268307144975
49.81329676845538


# Sample article

In [39]:
# Cleaned text of the first article (row 0).
df.iloc[0].clean_text

'"Hier ist Bayern": Der BR Newsletter informiert Sie immer montags bis freitags zum Feierabend über das Wichtigste vom Tag auf einen Blick kompakt und direkt in Ihrem privaten Postfach. Hier gehts zur Anmeldung! Schon am Dienstag gab es im Iran eine erste feierliche Zeremonie: Der Oberste Führer Ayatollah Ali Khamenei bestätigte die Wahl von Ebrahim Raisi. Alle trugen Maske, saßen vorbildlich in ausreichendem Abstand zueinander. Und Raisi bedauerte: "Höflichkeit und Respekt würden es jetzt eigentlich verlangen, dass ich die gesegnete Hand des Obersten Revolutionsführers küsse, nachdem er mich im Amt bestätigt hat. Aber in der aktuellen Situation wurde mir dieser Erfolg genommen."Corona nur eines der ProblemeErst Anfang der Woche erreichen die täglichen Corona-Neuinfektionen im Iran einen neuen Höchststand mit fast . Fällen. Insgesamt sind seit Beginn der Pandemie nach offiziellen Angaben mehr als . Menschen im Zusammenhang mit dem Virus gestorben. Raisi übernimmt das Land in einer extr

In [41]:
# Top-30 lemma-based TF-IDF terms of the first article (row 0).
df.iloc[0].TFIDF_lemmatizer_30

array(['Raisi', 'Frage', 'Mensch', 'Land', 'Situation', 'weiß', 'geben',
       'Iran', 'gehen', 'Monat', 'sagen', 'stehen', 'Fall', 'Verhandlung',
       'USA', 'schwer', 'Sanktion', 'Wasser', 'halten', 'lehnen',
       'Mohajeri', 'bestätigen', 'Präsident', 'Problem', 'Erfolg', 'seit',
       'einfach', 'ab', 'dürfen', 'erreichen'], dtype=object)

In [43]:
# Tags stored for the first article (row 0), for comparison with the predictions.
df.iloc[0].tags

array(['Ayatollah Ali Khamenei', 'Amtseinführung', 'Iran', 'Präsident',
       'Ebrahim Raisi'], dtype=object)