# Load data and Python libraries

In [1]:
# data processing libraries
import pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook
# (the library default is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

# display wider columns in pandas data frames where necessary
# (up to 150 characters per cell before truncation)
pd.set_option('max_colwidth',150)

In [2]:
import tensorflow as tf
# Print the TensorFlow version so it is recorded alongside the notebook outputs.
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.2.0


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_hub as hub

# Load the Universal Sentence Encoder's TF Hub module.
# `model` is the module-level callable that get_embeddings() invokes on the texts.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)

# Confirm which module was loaded.
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [4]:
# Load the training articles from the tab-separated file and report the
# frame's size and column names.
df_train = pd.read_csv("./transition_files/train.tsv", sep='\t')
print("df_train.shape:", df_train.shape)
print("df_train.columns:", df_train.columns)

df_train.shape: (33982, 12)
df_train.shape: Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas'],
      dtype='object')


In [5]:
# df_test = pd.read_csv("./transition_files/test.tsv", sep='\t')
# print("df_test.shape:",df_test.shape)
# print("df_test.columns:",df_test.columns)

# Getting text clusters through sentence embedding comparison

In [6]:
def get_embeddings(input):
    """Embed `input` with the module-level sentence-encoder `model`.

    Args:
        input: the texts to embed; callers in this notebook pass a list of
            strings.

    Returns:
        Whatever `model` returns for `input` (callers in this notebook call
        `.numpy()` on the result).
    """
    embeddings = model(input)
    return embeddings

In [7]:
def get_text_embeddings(df_data, N_batches):
    """Embed the 'first_10_sents' column of `df_data` in batches.

    The rows are split into at most `N_batches` consecutive batches, each
    batch is embedded with `get_embeddings`, and the batches are concatenated
    back into one frame.

    Args:
        df_data: DataFrame with a 'first_10_sents' text column.
        N_batches: maximum number of batches to split the rows into
            (positive integer).

    Returns:
        A new DataFrame with the rows of `df_data` in their original order
        plus an 'emb_vector_length' column (the embedding width) and one
        'emb_<i>' column per embedding dimension. The result has a fresh
        0..n-1 index. An empty `df_data` is returned as an empty copy without
        embedding columns, because the embedding width is unknown.

    Raises:
        ValueError: if `N_batches` is smaller than 1.
    """
    N_batches = int(N_batches)
    if N_batches < 1:
        raise ValueError("N_batches must be a positive integer")

    n_rows = len(df_data)
    if n_rows == 0:
        return df_data.copy()

    # Ceiling division: floor division gives a batch size of 0 (and an endless
    # loop) when there are fewer rows than batches, and one extra batch
    # whenever the rows do not divide evenly.
    batch_size = -(-n_rows // N_batches)
    n_batches = -(-n_rows // batch_size)
    print(n_batches, "batches with up to", batch_size, "texts each")

    list_dfs = []
    for batch_num, start in enumerate(range(0, n_rows, batch_size), start=1):
        df_tmp = df_data.iloc[start : start + batch_size].copy()
        # Reset so the batch lines up row-by-row with the embedding frame below.
        df_tmp = df_tmp.reset_index(drop=True)
        print ("Batch number:", batch_num, "out of ", n_batches)

        df_batch_embeddings = pd.DataFrame(get_embeddings(list(df_tmp['first_10_sents'])).numpy())

        # Name the columns from the actual embedding width rather than
        # assuming a fixed size.
        num_embeddings = df_batch_embeddings.shape[1]
        df_tmp["emb_vector_length"] = num_embeddings
        columns = ["emb_" + str(i) for i in range(num_embeddings)]
        df_tmp[columns] = df_batch_embeddings

        list_dfs.append(df_tmp)

    # Concatenate the batches into a single dataset with a unique index
    # (each batch was re-indexed from 0 above).
    return pd.concat(list_dfs, ignore_index=True)

In [8]:
def get_similarity_groups(df_tmp, threshold=0.85):
    """Greedily group rows by cosine similarity of their embedding columns.

    The first remaining row is taken as the seed of a new group. Every
    remaining row whose cosine similarity to the seed is at least `threshold`
    joins that group and is removed. This repeats until no rows are left.

    Args:
        df_tmp: DataFrame with an 'ID' column and embedding columns named
            'emb_<i>' (integer suffix). The frame is not modified.
        threshold: minimum cosine similarity to the seed row for a row to
            join the seed's group.

    Returns:
        DataFrame with columns 'ID' and 'group' (group numbers start at 1),
        ordered by group, keeping the index labels of the input rows. An empty
        input yields an empty frame with those two columns.

    Raises:
        KeyError: if `df_tmp` has rows but no 'emb_<i>' columns.
    """
    if len(df_tmp) == 0:
        return pd.DataFrame(columns=['ID', 'group'])

    prefix = "emb_"
    # Only 'emb_<digits>' columns are embedding dimensions; this excludes
    # helper columns such as 'emb_vector_length'.
    columns = [c for c in df_tmp.columns
               if str(c).startswith(prefix) and str(c)[len(prefix):].isdigit()]
    if not columns:
        raise KeyError("no embedding columns named 'emb_<i>' found")

    # Work on plain arrays so the caller's frame is never mutated and the wide
    # frame is not re-sliced on every iteration.
    embeddings = df_tmp[columns].to_numpy()
    ids = df_tmp['ID'].to_numpy()
    index = df_tmp.index

    group_number = 1
    list_dfs = []

    while len(ids) > 0:
        # Similarity of the seed (first remaining row) to every remaining row.
        similarities = cosine_similarity(embeddings[:1], embeddings).ravel()

        in_group = similarities >= threshold
        # The seed always belongs to its own group. Without this, a seed whose
        # self-similarity falls below the threshold (zero vector, or rounding
        # with a threshold near 1.0) would never be removed and the loop
        # would not terminate.
        in_group[0] = True

        list_dfs.append(pd.DataFrame({'ID': ids[in_group], 'group': group_number},
                                     index=index[in_group]))

        # Keep only the rows that were not assigned in this round.
        remaining = ~in_group
        embeddings = embeddings[remaining]
        ids = ids[remaining]
        index = index[remaining]
        group_number = group_number + 1

    return pd.concat(list_dfs)

In [9]:
# Grouping articles: work on a re-indexed copy of the training data, use the
# fresh positional index as a stable article ID, then embed every article.
df = df_train.copy().reset_index(drop=True)
df['ID'] = df.index

df_data = get_text_embeddings(df, N_batches=100)

100 batches with 340 texts each
Batch number: 1 out of  100
Batch number: 2 out of  100
Batch number: 3 out of  100
Batch number: 4 out of  100
Batch number: 5 out of  100
Batch number: 6 out of  100
Batch number: 7 out of  100
Batch number: 8 out of  100
Batch number: 9 out of  100
Batch number: 10 out of  100
Batch number: 11 out of  100
Batch number: 12 out of  100
Batch number: 13 out of  100
Batch number: 14 out of  100
Batch number: 15 out of  100
Batch number: 16 out of  100
Batch number: 17 out of  100
Batch number: 18 out of  100
Batch number: 19 out of  100
Batch number: 20 out of  100
Batch number: 21 out of  100
Batch number: 22 out of  100
Batch number: 23 out of  100
Batch number: 24 out of  100
Batch number: 25 out of  100
Batch number: 26 out of  100
Batch number: 27 out of  100
Batch number: 28 out of  100
Batch number: 29 out of  100
Batch number: 30 out of  100
Batch number: 31 out of  100
Batch number: 32 out of  100
Batch number: 33 out of  100
Batch number: 34 out

In [10]:
# Inspect the columns of the embedded frame returned by get_text_embeddings.
df_data.columns

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases',
       ...
       'emb_502', 'emb_503', 'emb_504', 'emb_505', 'emb_506', 'emb_507',
       'emb_508', 'emb_509', 'emb_510', 'emb_511'],
      dtype='object', length=526)

In [11]:
# Level 1 grouping (similarity threshold 0.3): attach each article's group
# number to df via its ID, stored under 'group_level_1'.
df_text_groups = get_similarity_groups(df_data, threshold=0.3)
df = df.merge(df_text_groups, on=['ID'], how="inner")
df['group_level_1'] = df.pop('group')
print(df.columns)

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1'],
      dtype='object')


In [12]:
# Level 2 grouping (similarity threshold 0.50): attach each article's group
# number to df via its ID, stored under 'group_level_2'.
df_text_groups = get_similarity_groups(df_data, threshold=0.50)
df = df.merge(df_text_groups, on=['ID'], how="inner")
df['group_level_2'] = df.pop('group')
print(df.columns)

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2'],
      dtype='object')


In [13]:
# Level 3 grouping (similarity threshold 0.7): attach each article's group
# number to df via its ID, stored under 'group_level_3'.
df_text_groups = get_similarity_groups(df_data, threshold=0.7)
df = df.merge(df_text_groups, on=['ID'], how="inner")
df['group_level_3'] = df.pop('group')
print(df.columns)

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')


In [14]:
# Shape of df after the three inner merges; comparing the row count with
# df_train.shape shows whether any article was dropped by a merge.
print(df.shape)

# df_train['group'] = df['group']
# df_train.head().T

(33982, 16)


In [15]:
# Write the grouped articles to a tab-separated file, without the index column.
df.to_csv("./data/train_grouped.tsv", sep='\t', index=False)

In [16]:
# Number of articles per publication.
df['publication'].value_counts()

Wired        14232
CNN          13763
Economist     3153
Gizmodo       2834
Name: publication, dtype: int64

In [17]:
# Distribution of the number of distinct level-2 groups inside each level-1 group.
(
    df.groupby('group_level_1')['group_level_2']
    .nunique()
    .describe(percentiles=[0.05, 0.15, 0.25, 0.5, 0.75, 0.85, 0.95])
    .to_frame()
    .T
)

Unnamed: 0,count,mean,std,min,5%,15%,25%,50%,75%,85%,95%,max
group_level_2,573.0,24.513089,93.947344,1.0,1.0,1.0,1.0,4.0,14.0,27.0,100.4,1416.0


In [18]:
# Distribution of the number of distinct level-3 groups inside each
# (level-1, level-2) group.
(
    df.groupby(['group_level_1', 'group_level_2'])['group_level_3']
    .nunique()
    .describe(percentiles=[0.05, 0.15, 0.25, 0.5, 0.75, 0.85, 0.95])
    .to_frame()
    .T
)

Unnamed: 0,count,mean,std,min,5%,15%,25%,50%,75%,85%,95%,max
group_level_3,14046.0,2.045351,5.399248,1.0,1.0,1.0,1.0,1.0,1.0,2.0,5.0,158.0


In [19]:
# Distribution of the number of articles in each (level-1, level-2) group.
print("Number of news per second level group:")
(
    df.groupby(['group_level_1', 'group_level_2'])['first_10_sents']
    .count()
    .describe(percentiles=[0.05, 0.15, 0.25, 0.5, 0.75, 0.85, 0.95])
    .to_frame()
    .T
)

Number of news per second level group:


Unnamed: 0,count,mean,std,min,5%,15%,25%,50%,75%,85%,95%,max
first_10_sents,14046.0,2.419336,9.372464,1.0,1.0,1.0,1.0,1.0,1.0,2.0,6.0,339.0


In [20]:
# Distribution of the number of articles in each
# (level-1, level-2, level-3) group.
print("Number of news per third level group:")
(
    df.groupby(['group_level_1', 'group_level_2', 'group_level_3'])['first_10_sents']
    .count()
    .describe(percentiles=[0.05, 0.15, 0.25, 0.5, 0.75, 0.85, 0.95])
    .to_frame()
    .T
)

Number of news per third level group:


Unnamed: 0,count,mean,std,min,5%,15%,25%,50%,75%,85%,95%,max
first_10_sents,28729.0,1.182847,1.34912,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,89.0


In [21]:
# Print a final marker once every cell above has been executed.
print("Done")

Done
