In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv('../data/combined_data_v6.csv')
data['headline'] = data['headline'].astype('U')
data['year'] = data['year'].astype(int)
data = data.drop_duplicates().reset_index(drop = True)
data.shape

(439412, 5)

In [8]:
data.sample(n = 10)

Unnamed: 0,source,headline,year,date,is_sarcastic
61456,ABC Australia,cyclone nora to cross queensland coast sunday ...,2018,2018-03-24,0
26266,Huffington Post,a popular voting reform could add 22 million a...,2019,2019-01-01,0
162239,atlantic,The Jeffrey Epstein–Victoria’s Secret Connection,2019,2019-08-06,0
182530,atlantic,Searching for the Black Trump Supporter,2016,2016-10-01,0
152207,guardian,To e or not to e? US statue sparks debate over...,2017,2017-08-24,0
7970,Huffington Post,Here's What You Need To Know About Obamacare E...,2017,2017-11-01,0
172637,atlantic,Who Survives a Sexual-Harassment Allegation?,2017,2017-11-30,0
75944,ABC Australia,british prime minister theresa may announces r...,2019,2019-05-24,0
147824,guardian,Late-night hosts: Trump ally Stephen Miller 't...,2018,2018-01-25,0
219061,Fox News,"US officials 'concerned' as Iran, Russia plan ...",2016,2016-11-15,0


### Preprocessing Steps
1. Standardize common abbreviations* (WIP)
    1. u.s. --> usa
1. Lowercase
1. Expand Contractions
1. Optional:
    1. Remove Source Specific Language
    1. Remove Profanity
1. Remove Special Characters


In [9]:
def replace_words(text, replace_dict):
    tokens = []
    for w in text.split():
        word = w
        for t in replace_dict.keys():
            if w == t:
                word = replace_dict[t]
        tokens.append(word)
    
    return " ".join(tokens)

In [10]:
data['clean'] = data['headline']

## Standardize Common Abbreviations
translate_dict = {
    "US": "USA",
    "U.S.": "USA",
    "u.s.": "USA",
    "u.s": "USA",
}



data['clean'] = data['clean'].apply(lambda x: replace_words(x, translate_dict))

In [11]:
# Lowercase
data['clean'] = data['clean'].apply(lambda x: x.lower())

In [12]:
# Expand Contractions
contractions_dict = { "ain't": "are not","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who's": "who is",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}



data['clean'] = data['clean'].apply(lambda x: replace_words(x, contractions_dict))


In [13]:
data.loc[data['headline'].str.contains("'ll"), ['headline','clean']]

Unnamed: 0,headline,clean
287,5 Spring 2016 Beauty Runway Trends You'll Actu...,5 spring 2016 beauty runway trends you will ac...
824,What Your Movements May Reveal About How You'l...,what your movements may reveal about how you w...
978,Friday Talking Points -- It's My Party and I'l...,friday talking points -- it's my party and i'l...
1200,Not Everything Is Terrible: There's Now 'Tipsy...,not everything is terrible: there's now 'tipsy...
1221,Donald Trump Says He'll Cancel Boeing's Air Fo...,donald trump says he will cancel boeing's air ...
...,...,...
245180,Maryland authorities say they'll investigate K...,maryland authorities say they will investigate...
245616,Elizabeth Warren says she'll wear Planned Pare...,elizabeth warren says she will wear planned pa...
245883,Mom had heart attack while driving kids to sch...,mom had heart attack while driving kids to sch...
245902,Paula White on how she became Trump's spiritua...,paula white on how she became trump's spiritua...


In [14]:
### Remove bad text:

def remove_artifacts(s):
    return re.sub('xe[0-9]+x[0-9]+x[0-9]+', '',s)

data['clean'] = data['clean'].apply(lambda x: remove_artifacts(x))

In [15]:
data.loc[data['headline'].str.contains("xe"), ['headline','clean']]

Unnamed: 0,headline,clean
28,"Banking Doesn't Have To Be A Boys' Club, Bank ...","banking does not have to be a boys' club, bank..."
188,'The President Show' Signs A Bunch of Executiv...,'the president show' signs a bunch of executiv...
448,New York City Axes English Language Requiremen...,new york city axes english language requiremen...
538,Real Health Care Fixes,real health care fixes
539,Syrian Activist Bassel Safadi Was Executed In ...,syrian activist bassel safadi was executed in ...
...,...,...
246624,"Wildfires threaten firefighters, homes across ...","wildfires threaten firefighters, homes across ..."
246632,Biden tells Pelosi and Schumer xe2x80x98mi cas...,"biden tells pelosi and schumer mi casa, you casa"
246633,Director insists xe2x80x98The Young Messiahxe2...,director insists the young messiah does not co...
246650,Brett Young shares hilarious story about wifex...,brett young shares hilarious story about wifes...


In [16]:
## Remove Special Characters
data['clean'] = data['clean'].apply(lambda x: re.sub('[^A-Za-z0-9 ]+', '', x))

In [3]:
guardian_length = data[data.source == 'guardian'].shape[0]

Huff = data[data.source == 'Huffington Post']
ABC = data[data.source == 'ABC Australia']

other_sources = data[(data.source != 'Huffington Post') & (data.source != 'ABC Australia')]

Huff = Huff.sample(n=guardian_length)
ABC = ABC.sample(n=guardian_length)

data_sources = [Huff, ABC, other_sources]
data = pd.concat(data_sources)
data.reset_index(drop=True, inplace = True)
data.shape

In [18]:
data[['source', 'year', 'clean', 'is_sarcastic']].to_csv('../data/combined_clean_v6.csv', index = False, sep = "|")

In [19]:
# df = data[['source', 'year', 'clean', 'is_sarcastic']]
# df.columns = ['source','year','headline','is_sarcastic']

In [20]:
# df.to_csv('../data/combined_data_v4.csv',index = False)