In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('ggplot')

In [2]:
BRITISH_AIRLINES = "British_Airways"
AMERICAN_AIRLINES = "AmericanAir"

companyUsername = dict()
companyUsername['british'] = BRITISH_AIRLINES
companyUsername["american"] = AMERICAN_AIRLINES

In [3]:
def readCollectedTweets(filename):
    df = pd.read_csv(filename, error_bad_lines=False)
    df.drop(inplace=True, axis=1, labels=["tweet_id",
                                               "username",
                                              "permalink",
                                              "geological_location"])
    return df

def getUsersTweets(companiesDataframes, companyUsername):
    userDataframes = dict()
    
    for companyKey in companiesDataframes:
        temp_df = companiesDataframes[companyKey]
        userDataframes[companyKey] = temp_df[temp_df["user_handle"] != companyUsername[companyKey]]
    
    return userDataframes
    

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenizeTweetSentences(tweetsDict):
    
    tokenizedSentences = dict()
    
    for k in tweetsDict:
        tokenizedSentences[k] = sent_tokenize(tweetsDict[k])
    return tokenizedSentences

def tokenizeTweetWords(tokenizedSentences):
    tokenizedWords = dict()

    for k in tokenizedSentences.keys():
        words = list()
        
        for sentences in tokenizedSentences[k]:
            words.append(sentences.split(" "))
        tokenizedWords[k] = words
    
    return tokenizedWords

def removeStopwords(data, args=None):
    """
        Remove os stops words
        
        Attributes:
            data: String contendo o texto inteiro
            EX: data = "All work and no play makes jack dull boy. 
            All work and no play makes jack a dull boy."
        
        Return:
            Retorna uma lista contendo as palavras do texto filtradas
    
    """
    from nltk.corpus import stopwords
 
#     data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
    stopWords = list(set(stopwords.words('english')))
    
    wordsFiltered = []

    for w in data:
        if w not in stopWords:
            wordsFiltered.append(w)

    return(wordsFiltered)

In [5]:
british_df = readCollectedTweets("british_airlines.csv")
american_df = readCollectedTweets("american_airlines.csv")

# Exclui tweets originarios da propria cia
companiesDataframes = dict()
companiesDataframes["british"] = british_df
companiesDataframes["american"] = american_df


userDataframes = getUsersTweets(companiesDataframes, companyUsername)

b'Skipping line 15: expected 11 fields, saw 13\nSkipping line 73: expected 11 fields, saw 13\nSkipping line 147: expected 11 fields, saw 12\nSkipping line 193: expected 11 fields, saw 12\nSkipping line 251: expected 11 fields, saw 12\nSkipping line 289: expected 11 fields, saw 12\nSkipping line 397: expected 11 fields, saw 12\nSkipping line 401: expected 11 fields, saw 12\nSkipping line 431: expected 11 fields, saw 12\nSkipping line 452: expected 11 fields, saw 12\nSkipping line 564: expected 11 fields, saw 12\nSkipping line 839: expected 11 fields, saw 12\nSkipping line 1060: expected 11 fields, saw 13\nSkipping line 1277: expected 11 fields, saw 12\nSkipping line 1282: expected 11 fields, saw 12\nSkipping line 1331: expected 11 fields, saw 13\nSkipping line 1457: expected 11 fields, saw 12\nSkipping line 1549: expected 11 fields, saw 13\nSkipping line 1658: expected 11 fields, saw 12\nSkipping line 1721: expected 11 fields, saw 12\nSkipping line 1745: expected 11 fields, saw 13\nSkip

## British Airlines

In [6]:
# Recupera o dataframe da British Airlines
britishTweets = userDataframes["british"]

britishTokenizedSentences = tokenizeTweetSentences(dict(britishTweets['text']))
britishTokenizedWords = tokenizeTweetWords(britishTokenizedSentences)

In [7]:
import nltk
def lexical_diversity(text):
    return len(set(text)) / len(text)

lexical_diversity(britishTokenizedSentences[0])

1.0

In [8]:
def filterWords(tokenized, function, args=None):
    filtered = dict()
    for k1 in tokenized:
        filtered[k1] = dict()
        tempList = list()
        
        # k2 é uma lista de strings
        for k2 in tokenized[k1]:
            tempList.append(function(k2, args))
        filtered[k1] = tempList
    return filtered

def removeByRegex(listOfWords, regexString):
    import re
    return [t for t in listOfWords if not re.search(regexString, t)]

In [27]:
stopwordsRemoved = filterWords(britishTokenizedWords, removeStopwords, args=None)
callout_regex = "(@[A-Za-z0-9_]*)|(&[A-Za-z0-9_]*)"
withoutCallout = filterWords(stopwordsRemoved, removeByRegex, args=callout_regex)

In [49]:
posTaggedTuple = nltk.pos_tag(withoutCallout[0][0])
posTaggedTuple
word_tag_pairs = nltk.bigrams(posTaggedTuple)
list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NN'))

[('Your', 'PRP$'),
 ('crew', 'NN'),
 ('calm,', 'NN'),
 ('professional,', 'NN'),
 ('resourceful', 'JJ'),
 ('-', ':'),
 ('tremendous', 'JJ'),
 ('advert', 'NN'),
 ('you!', 'NN')]

['PRP$', 'NN', 'JJ']

In [51]:
word_tag_fd = nltk.FreqDist(posTaggedTuple)
[word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('N')]

['crew/NN', 'calm,/NN', 'professional,/NN', 'advert/NN', 'you!/NN']

In [59]:
cfd1 = nltk.ConditionalFreqDist(posTaggedTuple)
cfd1['calm,'].keys()

dict_keys(['NN'])

In [42]:
# import re
# callout_regex = "@[A-Za-z0-9_]+"
# link_regex = 'http[A-Za-z0-9_.-]+'
# [t for t in tokenizedWords[0][0] if not re.search(callout_regex, t)]

## American Airlines

In [None]:
american_df = pd.read_csv("american_airlines.csv")
american_df.drop(inplace=True, axis=1, labels=["tweet_id",
                                               "username",
                                              "permalink",
                                              "geological_location"])
american_df.head()

In [None]:
# Exclui tweets originarios da propria cia
americanUsers_df = american_df[american_df["user_handle"] != "AmericanAir"]


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

americanText = dict(americanUsers_df["text"])

tokenizedSentences = dict()
for k in americanText.keys():
    tokenizedSentences[k] = sent_tokenize(americanText[k])


In [None]:
tokenizedWords = dict()
for k in tokenizedSentences.keys():
    words = list()
    for sentences in tokenizedSentences[k]:
        words.append(sentences.split(" "))
    tokenizedWords[k] = words

In [None]:
tokenizedWords[0][0]

In [None]:
import re
callout_regex = "@[A-Za-z0-9_]+"
link_regex = 'http[A-Za-z0-9_.-]+'
[t for t in tokenizedWords[0][0] if not re.search(callout_regex, t)]

In [None]:
removeStopwords(tokenizedWords[0][0])

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

preprocessed_docs = []

for doc in removeStopwords(tokenizedWords[0][0]):
    final_doc = []
    final_doc.append(porter.stem(doc))
        #final_doc.append(snowball.stem(word))
        #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also takes part of speech as an argument!
    preprocessed_docs.append(final_doc)

print (preprocessed_docs)