In [145]:
import pandas as pd
import numpy as np
import random

from itertools import chain
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import wordnet


### Get data

In [13]:
df = pd.read_csv('../data/data.csv')
df = df[:100]

In [93]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [30]:
complete_text = " ".join(df['Sentence'])
words = [word.lower() for sentence in complete_text.split('.') for word in sentence.split(' ')]
# print(words)
df_freq = pd.value_counts(np.array(words)).reset_index()

In [34]:
len(words) * 0.025

57.150000000000006

### Synonym Replacement

In [131]:
class SynonymReplacement(BaseEstimator, TransformerMixin):

    """ 
    We want to perform synonym replacement with words that are meaningful. To 
    determine if a word if meaningful, we will use Zipft law
    """

    def __init__(self, max_synonyms = 3, perc_replacements = 0.025):
        """ 
        Parameters 
        ----------
        max_synonyms: int
            maximum of synonyms for each words
        perc_replacements: float
            percentage of words to be replaced
        """
        self.max_synonyms = max_synonyms
        self.perc_replacements = perc_replacements

    def fit(self, X, y=None):
        return self


    def transform(self, documents, labels): # TODO
        # 1. get list of words to replace with zipft law
        synonyms = self._get_words_to_replace(documents)

        # 2. get data augmentation (everytime we see word in list, we augment)
        new_documents = []
        synonyms_dict = {}
        for sentence, label in zip(documents, labels): 
            new_documents.append([sentence, label]) # add old sentence
            # add new sentences with synonym replacement
            words = sentence.split(' ')
            for i, word in enumerate(words):
                if word in synonyms:
                    # find list of synonym for word in not in dict
                    if word not in synonyms_dict: 
                        synonyms_dict[word] = self._get_synonyms(word)

                    # data augmentation
                    for synonym in synonyms_dict[word]: 
                        new_sentence_words = words[0:i-1] + [synonym] + words[0: i+1]\
                            if i + 1 < len(sentence) else words[0:i-1] + [synonym]
                        new_sentence = " ".join(new_sentence_words)
                        new_documents.append([new_sentence, label])


        return new_documents

    def _get_num_of_words_to_replace(self, documents): 
        """ 
        Function that return how many words to replace
        len(vocab) * perc
        """
        words = [word.lower() for sentence in complete_text.split('.') for word in sentence.split(' ')]
        return int(len(words) * self.perc_replacements)
        

    def _get_words_to_replace(self, documents): 
        """ 
        Function that get list of words to perform data augmentation on. 
        We choose the bottom from zipft law
        """
        # get word frequency count
        df_freq = self._get_word_frequency(documents)

        # get num words to find synonym
        num_words_to_replace = self._get_num_of_words_to_replace(documents)

        # find the list of words to replace
        words_to_replace = list(df_freq[:num_words_to_replace]['word'])

        return words_to_replace
        

    def _get_word_frequency(self, documents): 
        """ 
        Function that count word frequency 

        Parameters
        ----------
        documents: list of string

        Returns
        -------
        df: pandas dataframe
            dataframe with word frequency (col: word, count) in decreasing order
        """
        complete_text = " ".join(documents)
        words = [word.lower() for sentence in complete_text.split('.') for word in sentence.split(' ')]

        # remove selected words
        to_remove = [' ', ',', '', '-', '``', "'"]
        words = [word for word in words if word not in to_remove]
        
        freq = pd.value_counts(np.array(words)).rename_axis('word').reset_index(name='count')
        freq = freq.sort_values(by=['count'], ascending=True)

        return freq


    def _get_synonyms(self, word):
        """ 
        Function that return list of synonym using wordnet model

        Parameters
        ----------
        word: string
            word to find synonym for

        Returns
        -------
        synonym: list of string
            list of synonym. Include initial word
        """
        # FIXME: why does list change every time function is run

        # get all synonyms from wordnet
        synonym_tokens = wordnet.synsets(word)
        synonyms = list(set(chain.from_iterable([word.lemma_names() for word in synonym_tokens])))

        # get top synonym (current: first 5)
        top_synonyms = synonyms[:self.max_synonyms]

        # clean: replace '_' by ' '
        top_synonyms = [word.replace('_', ' ') for word in top_synonyms]

        return top_synonyms


In [132]:
replacement = SynonymReplacement()
replacement._get_word_frequency(df['Sentence'])

Unnamed: 0,word,count
459,estonia,1
605,fish,1
606,30%,1
607,co/ir2f,1
608,l+ñnnen,1
...,...,...
4,a,43
3,in,49
2,to,52
1,of,68


In [133]:
new_sentences = replacement.transform(df['Sentence'], df['Sentiment'])

In [134]:
print(len(new_sentences))
print(len(df['Sentence']))

165
100


In [142]:
[sentence for i in range(3) for sentence in list(df['Sentence'])]

["The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .",
 '$ESI on lows, down $1.50 to $2.50 BK a real possibility',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
 'According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia .',
 'The Swedish buyout firm has sold its remaining 22.4 percent stake , almost eighteen months after taking the company public in Finland .',
 "$SPY wouldn't be surprised to see a green close",
 "Shell's $70 Billion BG Deal Meets Shareholder Skepticism",
 'SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANGE RELEASE OCTOBER 14 , 2008 AT 2:45 PM The Company updates its full year outlook and 

### Random Word Deletion

In [158]:
class RandomWordDeletion(BaseEstimator, TransformerMixin):

    def __init__(self, num_duplicates, num_deletions):
        """ 
        Arguments
        ---------
        num_duplicates: int
            how many time we duplicate dataset
        num_deletions: int
            num of token to delete
        """
        self.num_duplicates = num_duplicates
        self.num_deletions = num_deletions

    def fit(self, X, y):
        return self

    def transform(self, documents, labels):
        # 1. duplicate documents
        documents, labels = list(documents), list(labels)
        duplicate_docs = [ sentence for i in range(self.num_duplicates) 
                for sentence in documents]
        duplicate_labels = [ label for i in range(self.num_duplicates) 
                for label in labels]

        # 2. perform word deletion
        new_docs = []
        for sentence, label in zip(duplicate_docs, duplicate_labels):
            words = sentence.split(' ')
            num_words_removed = 0
            while words and num_words_removed < self.num_deletions:
                # remove random index
                index_to_remove = random.randrange(len(words))
                words.pop(index_to_remove)
                num_words_removed += 1
            new_sentence = " ".join(words)
            if new_sentence: # make sure that sentence is at least one word 
                new_docs.append([new_sentence, label])

        return new_docs


In [160]:
word_del = RandomWordDeletion(2, 4)
word_del.transform(df['Sentence'], df['Sentiment'])

[["The GeoSolutions technology will leverage 's GPS solutions by providing Based Search Technology , a Communities Platform , relevant multimedia content and a new and powerful commercial model",
  'positive'],
 ['$ESI lows, down $2.50 a real possibility', 'negative'],
 ["For the last quarter 2010 , Componenta 's net sales doubled EUR131m from EUR76m for the a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'],
 ['According to the Finnish-Russian of Commerce , all the major construction companies of Finland are .',
  'neutral'],
 ['The Swedish buyout has sold its remaining 22.4 percent , almost eighteen months after taking the public Finland .',
  'neutral'],
 ["wouldn't surprised to green close", 'positive'],
 ['$70 Billion Deal Skepticism', 'negative'],
 ['SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANGE RELEASE 14 2008 AT 2:45 The Company updates its full year outlook and estimates its results to remain at for the full year .',
  'ne

### Character Deletion

In [174]:
class RandomCharacterDeletion(BaseEstimator, TransformerMixin):

    def __init__(self, num_duplicates, num_deletions):
        """ 
        Arguments
        ---------
        num_duplicates: int
            how many time we duplicate dataset
        num_deletions: int
            num of token to delete
        """
        self.num_duplicates = num_duplicates
        self.num_deletions = num_deletions

    def fit(self, X):
        return self

    def transform(self, documents, labels):
        # 1. duplicate documents
        documents, labels = list(documents), list(labels)
        duplicate_docs = [ sentence for i in range(self.num_duplicates) 
                for sentence in documents]
        duplicate_labels = [ label for i in range(self.num_duplicates) 
                for label in labels]

        # 2. perform character deletion
        new_docs = []
        for sentence, label in zip(duplicate_docs, duplicate_labels):
            characters = [c for c in sentence]
            num_char_removed = 0
            while words and num_char_removed < self.num_deletions:
                # remove random index
                index_to_remove = random.randrange(len(characters))
                characters.pop(index_to_remove)
                num_char_removed += 1
            new_sentence = "".join(characters)
            if new_sentence: # make sure that sentence is at least one word 
                new_docs.append([new_sentence, label])

        return new_docs



In [175]:
char_del = RandomCharacterDeletion(2, 5)
char_del.transform(df['Sentence'], df['Sentiment'])

[["The GeoSolutions technology will leverage Benefon ' GPS solutions by providing Location Based Seach Technoogy , a Communities Platform ,location relevant multimedia content and a new and powerful ommercial model .",
  'positive'],
 ['ESI on lws, down $150to $250 BK a real possibility', 'negative'],
 ["Fr the last quarer of 2010 , omponenta 's net sales doubled to EUR131m from EUR76m fo the same period a year earlier , while it moved to a zero pe-tax profit from a pre-tax loss of EUR7m .",
  'positive'],
 ['Accordig to th Finish-Russian Chaber of Commerce , all the major construction companies of Finland ar operating in Russia .',
  'neutral'],
 ['The Swedish buyot firm has sod its reaining 22.4 percent stake, almost eighten months after taking the company public in Finland .',
  'neutral'],
 ["$SPY wouldn'tbe surprised to ee a green os", 'positive'],
 ["Shell's $70 Billio G Deal eets Shareholder Spticism", 'negative'],
 ['SSH COMMUNCATIONS SECURITY CORP STOCK EXCHANGE RELEASE OCTOBE

### Word Swap

In [179]:
class RandomWordSwap(BaseEstimator, TransformerMixin):

    def __init__(self, num_duplicates, num_swaps):
        """ 
        Arguments
        ---------
        num_duplicates: int
            how many time we duplicate dataset
        num_swaps: int
            num of token to swap
        """
        self.num_duplicates = num_duplicates
        self.num_swaps = num_swaps

    def fit(self, X, y):
        return self

    def transform(self, documents, labels):
        # 1. duplicate documents
        documents, labels = list(documents), list(labels)
        duplicate_docs = [ sentence for i in range(self.num_duplicates) 
                for sentence in documents]
        duplicate_labels = [ label for i in range(self.num_duplicates) 
                for label in labels]

        new_docs = []
        # 2. perform word swap
        for sentence, label in zip(duplicate_docs, duplicate_labels):
            new_sentence = self._swap_words_in_sentence(sentence)
            new_docs.append([new_sentence, label])

        return new_docs

    def _swap_words_in_sentence(self, sentence):
        """ 
        Given a sentence (string), return a new sentence where words were 
        swap n times
        """
        words = sentence.split(' ')
        for _ in range(self.num_swaps):
            # generate random index swap
            index = random.randrange(len(words) - 1)
            words[index], words[index + 1] = words[index + 1], words[index]
        return " ".join(words)



In [180]:
word_swap = RandomWordSwap(2, 3)
word_swap.transform(df['Sentence'], df['Sentiment'])

[["The GeoSolutions technology leverage will 's Benefon GPS solutions by providing Location Based Search , Technology a Communities Platform , location relevant multimedia content and a new and powerful commercial model .",
  'positive'],
 ['on $ESI lows, $1.50 down to $2.50 BK a possibility real', 'negative'],
 ["For the last quarter of 2010 , Componenta 's net sales to doubled EUR131m from EUR76m the for same period a year earlier , while it moved to a pre-tax zero profit from a pre-tax loss of EUR7m .",
  'positive'],
 ['According to the Chamber Finnish-Russian Commerce of , all the major construction of companies Finland are operating in Russia .',
  'neutral'],
 ['The Swedish buyout firm has sold its remaining 22.4 percent stake , almost eighteen after months taking the public in company Finland .',
  'neutral'],
 ["wouldn't $SPY be surprised to a see close green", 'positive'],
 ["$70 Shell's BG Billion Meets Deal Shareholder Skepticism", 'negative'],
 ['SSH COMMUNICATIONS CORP SE

### Character Swap

In [181]:
class RandomCharacterSwap(BaseEstimator, TransformerMixin):

    def __init__(self, num_duplicates, num_swaps):
        """ 
        Arguments
        ---------
        num_duplicates: int
            how many time we duplicate dataset
        num_swaps: int
            num of token to swap
        """
        self.num_duplicates = num_duplicates
        self.num_swaps = num_swaps

    def fit(self, X, y):
        return self

    def transform(self, documents, labels):
        # 1. duplicate documents
        documents, labels = list(documents), list(labels)
        duplicate_docs = [ sentence for i in range(self.num_duplicates) 
                for sentence in documents]
        duplicate_labels = [ label for i in range(self.num_duplicates) 
                for label in labels]

        # 2. perform character swap
        new_docs = []
        for sentence, label in zip(duplicate_docs, duplicate_labels):
            new_sentence = self._swap_characters_in_sentence(sentence)
            new_docs.append([new_sentence, label])

        return new_docs

    def _swap_characters_in_sentence(self, sentence):
        chars = [c for c in sentence]
        for _ in range(self.num_swaps):
            index = random.randrange(len(chars) - 1)
            chars[index], chars[index+1] = chars[index+1], chars[index]
        return "".join(chars)

        

In [183]:
char_swap = RandomCharacterSwap(2, 5)
char_swap.transform(df['Sentence'], df['Sentiment'])

[["The GeoSolutions technology wlil leverage Benefon 's GPS solutions by providing Location Based Search Technolgoy , a Communities Platform , location relevant multimedia content anda  newa nd powerful commecrial model .",
  'positive'],
 ['$ESIo n lows, down $1.50 t o$2.50B K a real possibility', 'negative'],
 ["For the lastq uarter of 2010 , Component a's ne tsales doubled to EUR311m from EUR76m for the same period a year aerlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
  'positive'],
 ['cAcording to the Finnish-Russian Chamber of Commerce , all the major construction ocmpanies of Finland areo peraitng ni Russia .',
  'neutral'],
 ['hTe Sewdish buoyut firm has sold its remaining 22.4 percent stake , almos teighteen months after takingt he company public in Finland .',
  'neutral'],
 ["$SYP wuoldn't be surprised to see a rgeen close", 'positive'],
 ["hSell's $70 Billio nBG Deal Meest Shareohlder Skeptciism", 'negative'],
 ['SSH COMMUNICATINOS SECRUIT

# Create New Dataset using data augmentation

In [184]:
dataset_synonym_replacement = SynonymReplacement().transform(df['Sentence'], df['Sentiment'])
dataset_word_deletion = RandomWordDeletion(2, 2).transform(df['Sentence'], df['Sentiment'])
dataset_char_deletion = RandomCharacterDeletion(2, 5).transform(df['Sentence'], df['Sentiment'])
dataset_word_deletion = RandomWordSwap(2, 2).transform(df['Sentence'], df['Sentiment'])
dataset_char_deletion = RandomCharacterSwap(2, 5).transform(df['Sentence'], df['Sentiment'])

In [187]:
new_dataset = dataset_synonym_replacement + dataset_word_deletion\
    + dataset_char_deletion + dataset_word_deletion + dataset_char_deletion
new_dataset = pd.DataFrame(new_dataset, columns=['Sentence', 'Sentiment'])

In [188]:
new_dataset

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
960,M-Real said there are ` no gruonds ' for the r...,neutral
961,Lember said the matter was topical aslo in Est...,neutral
962,- Moody 's said it gave P A1 long-term senior ...,neutral
963,T$SLAr eclla,negative
