# Sentence Embedding Script -- Reddit News Header Data
## By: Oliver Hamilton, Tim O'Brien, and Gabriel Ting
---

In [1]:
import tensorflow as tf
import tensorflow_hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Location of the Universal Sentence Encoder module that is loaded below.
universal_sentence_encoder_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the module once; later cells call `sentence_encoder` on lists of headers.
sentence_encoder = tensorflow_hub.load(universal_sentence_encoder_url)
print(f"module loaded: {universal_sentence_encoder_url}")

module loaded: https://tfhub.dev/google/universal-sentence-encoder/4


In [3]:
# This function removes the bytes-literal wrapper (b'...' or b"...") and any surrounding
# quote or space characters from a news article header.
def clean_header(header):
    """Return `header` without its bytes-literal prefix and surrounding quotes/spaces.

    Parameters
    ----------
    header : str
        Raw header text, possibly wrapped like a printed bytes literal
        (a 'b' followed by a single or double quote).

    Returns
    -------
    str
        The header with leading/trailing single quotes, double quotes and
        spaces removed. An empty string, or one made up only of those
        characters, yields "" instead of raising IndexError.
    """
    strip_chars = "'\" "  # single quote, double quote, and space
    text = header.lstrip(strip_chars)
    # Drop a leading 'b' only when a quote follows it, i.e. when it is the
    # bytes-literal prefix. A header that really starts with the letter 'b'
    # (for example "breaking news") keeps its first letter.
    if text[:2] in ("b'", 'b"'):
        text = text[1:]
    return text.strip(strip_chars)

In [6]:
# Load the combined dataset and set the "Label" and "Date" columns aside so that
# only the remaining (header) columns are cleaned.
combined_dataset = pd.read_csv("./data/Combined_News_DJIA.csv")
label_column = combined_dataset["Label"]
date_column = combined_dataset["Date"]
combined_dataset = combined_dataset.drop(columns=["Label", "Date"])
# Clean every cell, column by column. Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
# str(x) keeps the existing handling of non-string cells (e.g. NaN becomes "nan").
combined_dataset = combined_dataset.apply(
    lambda column: column.map(lambda x: clean_header(str(x)))
)
combined_dataset.head()

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,Georgia 'downs two Russian warplanes' as count...,BREAKING: Musharraf to be impeached.,Russia Today: Columns of troops roll into Sout...,Russian tanks are moving towards the capital o...,"Afghan children raped with 'impunity,' U.N. of...",150 Russian tanks have entered South Ossetia w...,"Breaking: Georgia invades South Ossetia, Russi...",The 'enemy combatent' trials are nothing but a...,Georgian troops retreat from S. Osettain capit...,Did the U.S. Prep Georgia for War with Russia?,...,Georgia Invades South Ossetia - if Russia gets...,Al-Qaeda Faces Islamist Backlash,"Condoleezza Rice: ""The US would not act to pre...",This is a busy day: The European Union has ap...,"Georgia will withdraw 1,000 soldiers from Iraq...",Why the Pentagon Thinks Attacking Iran is a Ba...,Caucasus in crisis: Georgia invades South Ossetia,Indian shoe manufactory - And again in a seri...,Visitors Suffering from Mental Illnesses Banne...,No Help for Mexico's Kidnapping Surge
1,Why wont America and Nato help us? If they won...,Bush puts foot down on Georgian conflict,Jewish Georgian minister: Thanks to Israeli tr...,Georgian army flees in disarray as Russians ad...,Olympic opening ceremony fireworks 'faked,What were the Mossad with fraudulent New Zeala...,Russia angered by Israeli military sale to Geo...,An American citizen living in S.Ossetia blames...,Welcome To World War IV! Now In High Definition!,"Georgia's move, a mistake of monumental propor...",...,Israel and the US behind the Georgian aggression?,"Do not believe TV, neither Russian nor Georgia...",Riots are still going on in Montreal (Canada) ...,China to overtake US as largest manufacturer,War in South Ossetia [PICS],Israeli Physicians Group Condemns State Torture,Russia has just beaten the United States over ...,Perhaps *the* question about the Georgia - Rus...,Russia is so much better at war,So this is what it's come to: trading sex for ...
2,Remember that adorable 9-year-old who sang at ...,Russia 'ends Georgia operation,If we had no sexual harassment we would have n...,Al-Qa'eda is losing support in Iraq because of...,Ceasefire in Georgia: Putin Outmaneuvers the West,Why Microsoft and Intel tried to kill the XO $...,Stratfor: The Russo-Georgian War and the Balan...,I'm Trying to Get a Sense of This Whole Georgi...,The US military was surprised by the timing an...,U.S. Beats War Drum as Iran Dumps the Dollar,...,U.S. troops still in Georgia (did you know the...,Why Russias response to Georgia was right,"Gorbachev accuses U.S. of making a ""serious bl...","Russia, Georgia, and NATO: Cold War Two",Remember that adorable 62-year-old who led you...,War in Georgia: The Israeli connection,All signs point to the US encouraging Georgia ...,Christopher King argues that the US and NATO a...,America: The New Mexico?,BBC NEWS | Asia-Pacific | Extinction 'by man n...
3,U.S. refuses Israel weapons to attack Iran: re...,When the president ordered to attack Tskhinval...,Israel clears troops who killed Reuters cameraman,"Britain\'s policy of being tough on drugs is ""...",Body of 14 year old found in trunk; Latest (ra...,China has moved 10 *million* quake survivors i...,Bush announces Operation Get All Up In Russia'...,Russian forces sink Georgian ships,The commander of a Navy air reconnaissance squ...,92% of CNN readers: Russia's actions in Georgi...,...,Elephants extinct by 2020?,US humanitarian missions soon in Georgia - if ...,Georgia's DDOS came from US sources,"Russian convoy heads into Georgia, violating t...",Israeli defence minister: US against strike on...,Gorbachev: We Had No Choice,Witness: Russian forces head towards Tbilisi i...,Quarter of Russians blame U.S. for conflict: poll,Georgian president says US military will take...,2006: Nobel laureate Aleksander Solzhenitsyn a...
4,All the experts admit that we should legalise ...,War in South Osetia - 89 pictures made by a Ru...,Swedish wrestler Ara Abrahamian throws away me...,Russia exaggerated the death toll in South Oss...,Missile That Killed 9 Inside Pakistan May Have...,Rushdie Condemns Random House's Refusal to Pub...,Poland and US agree to missle defense deal. In...,"Will the Russians conquer Tblisi? Bet on it, n...","Russia exaggerating South Ossetian death toll,...",Musharraf expected to resign rather than face ...,...,Bank analyst forecast Georgian crisis 2 days e...,Georgia confict could set back Russia's US rel...,War in the Caucasus is as much the product of ...,"Non-media"" photos of South Ossetia/Georgia con...",Georgian TV reporter shot by Russian sniper du...,Saudi Arabia: Mother moves to block child marr...,Taliban wages war on humanitarian aid workers,"Russia: World ""can forget about"" Georgia\'s t...",Darfur rebels accuse Sudan of mounting major a...,Philippines : Peace Advocate say Muslims need ...


Below, in the most important part of the script, we loop over the first 10 columns of our dataset, which hold the top 10 news story headers on Reddit each day. For each column, we compute the sentence embedding of every row item using the Universal Sentence Encoder and store each of the 512 embedding dimensions as a separate entry in a dictionary. Then, we create a DataFrame object from that dictionary, so each of the first 10 columns in the original dataset is represented by 512 feature columns (`Top1_feature_0` through `Top10_feature_511`).

In [8]:
# Maps "<column>_feature_<k>" to a Series holding dimension k of that column's embeddings.
embeddings_dict = {}

# Because of the fixed size of the embedding vectors 2KB/sentence, we will only use the top 10 news stories
# iloc[:, :10] selects the first 10 columns up front; .items() replaces
# DataFrame.iteritems(), which was removed in pandas 2.0.
for col_name, col_data in combined_dataset.iloc[:, :10].items():
    print("Creating sentence embeddings for colunm: " + col_name)
    news_headings = col_data.tolist()
    # One embedding row per header in this column.
    col_embeddings = sentence_encoder(news_headings).numpy()
    print("Created sentence embeddings for column: " + col_name)
    # Take the embedding width from the encoder output rather than hard-coding it.
    for feature in range(col_embeddings.shape[1]):
        embeddings_dict[col_name + "_feature_" + str(feature)] = pd.Series(col_embeddings[:, feature])

embedding_dataset = pd.DataFrame(embeddings_dict)

Creating sentence embeddings for colunm: Top1
Created sentence embeddings for column: Top1
Creating sentence embeddings for colunm: Top2
Created sentence embeddings for column: Top2
Creating sentence embeddings for colunm: Top3
Created sentence embeddings for column: Top3
Creating sentence embeddings for colunm: Top4
Created sentence embeddings for column: Top4
Creating sentence embeddings for colunm: Top5
Created sentence embeddings for column: Top5
Creating sentence embeddings for colunm: Top6
Created sentence embeddings for column: Top6
Creating sentence embeddings for colunm: Top7
Created sentence embeddings for column: Top7
Creating sentence embeddings for colunm: Top8
Created sentence embeddings for column: Top8
Creating sentence embeddings for colunm: Top9
Created sentence embeddings for column: Top9
Creating sentence embeddings for colunm: Top10
Created sentence embeddings for column: Top10


In [9]:
# DataFrame.insert modifies the frame in place and raises ValueError when the
# column already exists, so only insert "Date" if it is not there yet. This
# keeps the cell safe to re-run without rebuilding `embedding_dataset`.
if "Date" not in embedding_dataset.columns:
    embedding_dataset.insert(0, "Date", date_column)
embedding_dataset.head()

Unnamed: 0,Date,Top1_feature_0,Top1_feature_1,Top1_feature_2,Top1_feature_3,Top1_feature_4,Top1_feature_5,Top1_feature_6,Top1_feature_7,Top1_feature_8,...,Top10_feature_502,Top10_feature_503,Top10_feature_504,Top10_feature_505,Top10_feature_506,Top10_feature_507,Top10_feature_508,Top10_feature_509,Top10_feature_510,Top10_feature_511
0,2008-08-08,0.045079,-0.022705,-0.054172,0.004627,-0.072887,0.006421,-0.013796,-0.033878,0.0168,...,0.037529,0.079555,0.0079,0.052349,-0.03285,-0.024886,-0.008362,-0.008685,-0.037892,-0.04178
1,2008-08-11,0.019073,-0.077482,-0.030234,-0.024577,-0.029507,0.055582,-0.009761,-0.014719,-0.087299,...,0.035775,0.010439,0.047468,0.081217,-0.042919,0.047936,-0.039455,-0.068909,-0.049422,-0.051906
2,2008-08-12,0.016142,0.059271,0.054074,-0.040012,-0.067032,0.061725,-0.013906,0.037723,-0.019361,...,0.016392,0.075807,-0.016552,0.010285,0.000613,-0.035458,-0.051954,-0.010246,-0.074031,-0.072625
3,2008-08-13,-0.032401,-0.064624,0.020649,0.00584,0.006389,0.054969,0.016002,0.005557,-0.024299,...,0.037088,0.064971,0.013768,0.051682,-0.05072,0.031203,0.008062,-0.045218,-0.021317,-0.054575
4,2008-08-14,0.022485,-0.027689,0.041283,0.037246,0.039393,0.002239,0.049105,-0.047468,-0.066502,...,0.048365,0.055074,0.078453,0.032827,-0.036714,0.023838,0.047548,0.054426,-0.072552,-0.024525


In [10]:
# Write the embeddings (with the "Date" column) to disk without the row index.
# NOTE(review): to_csv presumably returns None when given a file path, so
# `embedding_csv` would hold None rather than CSV text -- confirm nothing relies on it.
embedding_csv = embedding_dataset.to_csv("./data/embeddings_dataset.csv", index = False)