In [27]:
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import hunspell
spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                            '/usr/share/hunspell/en_US.aff')
from os import listdir
from os.path import isfile, join

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Opinion lexicons: every line of each file becomes one entry of the set.
with open("lexicons/positive-words.txt") as positiveFile:
    positiveList = {*positiveFile.read().splitlines()}

with open("lexicons/negative-words.txt") as negativeFile:
    negativeList = {*negativeFile.read().splitlines()}

In [4]:
class PreProcessing:
    """
    Derives lexicon- and surface-based features from a DataFrame of tweets.

    The constructor removes the collector metadata columns that are not used
    as features, `preprocessing()` adds the feature columns to `self.df` and
    `exportDataframe()` writes the result to `<filepath>_preprocessed.csv`.

    The methods rely on notebook-level names: `positiveList` / `negativeList`
    (word sets), `spellchecker`, `stopwords` and `SnowballStemmer`.

    NOTE(review): the capitalized-lexicon counters and `countHashtags` were
    corrected (see their docstrings). Models fitted on features produced by
    the previous implementation should be re-fitted.
    """

    # English stopword set, loaded on first use instead of once per tweet.
    _stopwordCache = None

    def __init__(self, df, filepath):
        """
        @params:
            df: DataFrame of collected tweets; it is modified in place
            filepath: path prefix used by exportDataframe()
        """
        self.filepath = filepath
        self.df = df
        # Raises KeyError if any of these columns is absent from df.
        self.df.drop(labels=['username',
                      'user_handle',
                      'date',
                      'retweets',
                      'favorites',
                      'geological_location',
                      'mentions',
                      'hashtags',
                      'tweet_id',
                      'permalink',
                      'col1', 'col2', 'col3'], axis=1, inplace=True)

    @staticmethod
    def _englishStopwords():
        """Return the English stopword set, reading the corpus only once."""
        if PreProcessing._stopwordCache is None:
            PreProcessing._stopwordCache = set(stopwords.words("english"))
        return PreProcessing._stopwordCache

    # feature 3
    def countPositiveCapitalized(self, tokens):
        """
        Counts the whitespace-separated words that start with an uppercase
        letter and whose lowercase form is in the positive words list.

        The lookup is done on the lowercased word, consistent with
        countPositive(); comparing the capitalized form directly could never
        match a lowercase lexicon entry.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split()
                   if t[0].isupper() and t.lower() in positiveList)

    # feature 4
    def countNegativeCapitalized(self, tokens):
        """
        Counts the whitespace-separated words that start with an uppercase
        letter and whose lowercase form is in the negative words list.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split()
                   if t[0].isupper() and t.lower() in negativeList)

    # feature 5
    def hasCapitalized(self, tokens):
        """
        Returns 1 if at least one word starts with an uppercase letter, else 0.

        @params:
            tokens: the tweet text as a single string
        """
        return int(any(t[0].isupper() for t in tokens.split()))

    # feature 6
    def countHashtags(self, tokens):
        """
        Counts the words that start with # (hashtags).

        A string is split on whitespace first; iterating it directly would
        visit single characters and count every '#' in the text instead of
        the words that begin with one.

        @params:
            tokens: the tweet text as a string, or an iterable of words
        """
        words = tokens.split() if isinstance(tokens, str) else tokens
        return sum(1 for t in words if t.startswith("#"))

    # feature 7
    def countPositive(self, tokens):
        """
        Counts the words whose lowercase form is in the positive words list.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split() if t.lower() in positiveList)

    # feature 8
    def countNegative(self, tokens):
        """
        Counts the words whose lowercase form is in the negative words list.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split() if t.lower() in negativeList)

    # feature 9
    def countNeutral(self, tokens):
        """
        Counts the words that are in neither the positive nor the negative
        words list (there is no separate neutral lexicon).

        @params:
            tokens: the tweet text as a single string
        """
        counter = 0
        for t in tokens.split():
            lowered = t.lower()
            if lowered not in negativeList and lowered not in positiveList:
                counter += 1
        return counter

    # feature 10
    def countCapitalizedWords(self, tokens):
        """
        Counts the words written entirely in uppercase and longer than one
        character.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split() if t.isupper() and len(t) > 1)

    # feature 11
    def countSpecialCharacters(self, tokens):
        """
        Counts the words that contain at least one character other than
        ASCII letters, digits and underscore. Each such word counts once,
        however many special characters it holds.

        @params:
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split()
                   if not re.match("^[a-zA-Z0-9_]*$", t))

    def countSpecificSpecialCharacter(self, specialCharacter, tokens):
        """
        Counts the words that are exactly equal to specialCharacter.

        @params:
            specialCharacter: the string to look for
            tokens: the tweet text as a single string
        """
        return sum(1 for t in tokens.split() if t == specialCharacter)

    def fixSpelling(self, tokens):
        """
        Replaces every word the spellchecker rejects with its first
        suggestion; a rejected word without suggestions becomes empty.

        Each word is emitted with a leading space, so a non-empty result
        starts with a space.

        @params:
            tokens: the text as a single string
        """
        newWords = ""
        for w in tokens.split():
            if spellchecker.spell(w):
                newWords += " " + w
                continue
            try:
                newWords += " " + spellchecker.suggest(w)[0]
            except IndexError:
                # No suggestion available: the word is dropped, its space is kept.
                newWords += " "
        return newWords

    def stemming(self, tokens):
        """
        Applies the English Snowball stemmer to each word.

        @params:
            tokens: the text as a single string

        Returns the list of stems.
        """
        stemmer = SnowballStemmer("english")
        return [stemmer.stem(w) for w in tokens.split()]

    def tweet2words(self, raw_tweet):
        """
        Removes profile mentions and every non-letter character, drops the
        stopwords and returns the remaining words joined by single spaces.

        The original casing is kept. The stopword comparison is therefore
        case-sensitive and a capitalized stopword such as "The" stays in the
        output.

        @params:
            raw_tweet: the tweet string collected
        """
        callout_regex = "@[A-Za-z0-9_]+"

        # Remove mentions of profiles
        letters_only = re.sub(callout_regex, " ", raw_tweet)
        letters_only = re.sub("[^a-zA-Z]", " ", letters_only)

        stops = PreProcessing._englishStopwords()
        meaningful_words = [w for w in letters_only.split() if w not in stops]
        return " ".join(meaningful_words)

    def clean_tweet_length(self, raw_tweet):
        """
        Counts the words left after removing non-letter characters,
        lowercasing and dropping the stopwords.

        @params:
            raw_tweet: the tweet string collected
        """
        letters_only = re.sub("[^a-zA-Z]", " ", raw_tweet)
        stops = PreProcessing._englishStopwords()
        return len([w for w in letters_only.lower().split() if w not in stops])

    def preprocessing(self):
        """
        Adds every feature column to self.df.

        The lexicon, capitalization, hashtag and special-character features
        are computed on the raw 'text' column; num_capitalized and
        correctedText are computed on the cleaned 'tweet2words' column.
        """
        self.df["text"] = self.df["text"].values.astype("U")

        self.df['tweet2words'] = self.df['text'].apply(self.tweet2words)
        self.df["num_capitalized"] = self.df["tweet2words"].apply(self.countCapitalizedWords)
        self.df['tweet_length'] = self.df['text'].apply(self.clean_tweet_length)

        # Number of occurrences. num_negative_words is computed once, from
        # 'text', at the position where the column was first created so the
        # column order of the exported file does not change.
        self.df["num_negative_words"] = self.df['text'].apply(self.countNegative)
        self.df["num_positive_words"] = self.df['text'].apply(self.countPositive)
        self.df["num_neutral_words"] = self.df['text'].apply(self.countNeutral)

        # Capitalized words
        self.df["has_capitalized"] = self.df['text'].apply(self.hasCapitalized)
        self.df["num_capitalised_positive_words"] = self.df['text'].apply(self.countPositiveCapitalized)
        self.df["num_capitalised_negative_words"] = self.df['text'].apply(self.countNegativeCapitalized)

        self.df["num_hashtags"] = self.df['text'].apply(self.countHashtags)
        self.df["num_special_character"] = self.df['text'].apply(self.countSpecialCharacters)
        self.df['correctedText'] = self.df['tweet2words'].apply(self.fixSpelling)

    def exportDataframe(self):
        """Writes self.df, without its index, to '<filepath>_preprocessed.csv'."""
        self.df.to_csv(self.filepath + '_preprocessed.csv', index=False)

In [11]:
def countPositiveCapitalized(tokens):
    """
    Counts the whitespace-separated words that start with an uppercase letter
    and whose lowercase form is in the positive words list.

    The lookup uses the lowercased word, as countPositive() does; comparing
    the capitalized form directly could never match a lowercase lexicon entry.

    @params:
        tokens: the tweet text as a single string
    """
    return sum(1 for t in tokens.split()
               if t[0].isupper() and t.lower() in positiveList)

# feature 4
def countNegativeCapitalized(tokens):
    """
    Counts the whitespace-separated words that start with an uppercase letter
    and whose lowercase form is in the negative words list.

    The lookup uses the lowercased word, as countNegative() does; comparing
    the capitalized form directly could never match a lowercase lexicon entry.

    @params:
        tokens: the tweet text as a single string
    """
    return sum(1 for t in tokens.split()
               if t[0].isupper() and t.lower() in negativeList)

# feature 5
def hasCapitalized(tokens):
    """
    Tells whether the tweet contains a word starting with an uppercase letter.

    @params:
        tokens: the tweet text as a single string

    Returns 1 when such a word exists, otherwise 0.
    """
    startsUpper = (word[0].isupper() for word in tokens.split())
    return 1 if any(startsUpper) else 0

# feature 6
def countHashtags(tokens):
    """
    Counts the words that start with # (hashtags).

    A string is split on whitespace first; iterating it directly would visit
    single characters and count every '#' in the text instead of the words
    that begin with one.

    @params:
        tokens: the tweet text as a string, or an iterable of words
    """
    words = tokens.split() if isinstance(tokens, str) else tokens
    return sum(1 for t in words if t.startswith("#"))

# feature 7
def countPositive(tokens):
    """
    Counts the words whose lowercase form is in the positive words list.

    @params:
        tokens: the tweet text as a single string
    """
    # The loop previously iterated over a misspelled, undefined name and
    # raised NameError on every call.
    return sum(1 for t in tokens.split() if t.lower() in positiveList)

# feature 8
def countNegative(tokens):
    """
    Counts the words whose lowercase form appears in the negative words list.

    @params:
        tokens: the tweet text as a single string
    """
    hits = [word for word in tokens.split() if word.lower() in negativeList]
    return len(hits)

# feature 9
def countNeutral(tokens):
    """
    Counts the words that appear in neither the negative nor the positive
    words list (the comparison uses the lowercase form of each word).

    @params:
        tokens: the tweet text as a single string
    """
    neutral = 0
    for word in tokens.split():
        lowered = word.lower()
        if not (lowered in negativeList or lowered in positiveList):
            neutral += 1
    return neutral

# feature 10
def countCapitalizedWords(tokens):
    """
    Counts the words written entirely in uppercase that are longer than one
    character.

    @params:
        tokens: the tweet text as a single string
    """
    shouted = [word for word in tokens.split() if len(word) > 1 and word.isupper()]
    return len(shouted)

# feature 11
def countSpecialCharacters(tokens):
    """
    Counts the words that hold at least one character other than ASCII
    letters, digits and underscore; each such word counts once.

    @params:
        tokens: the tweet text as a single string
    """
    plainWord = "^[a-zA-Z0-9_]*$"
    flagged = [word for word in tokens.split() if re.match(plainWord, word) is None]
    return len(flagged)

def countSpecificSpecialCharacter(specialCharacter, tokens):
    """
    Counts the whitespace-separated words that are exactly equal to
    specialCharacter.

    @params:
        specialCharacter: the string to look for
        tokens: the tweet text as a single string
    """
    matches = [word for word in tokens.split() if word == specialCharacter]
    return len(matches)

def fixSpelling(tokens):
    """
    Replaces every word the spellchecker rejects with its first suggestion;
    a rejected word without suggestions is replaced by an empty string.

    Each word is emitted with a leading space, so a non-empty result starts
    with a space.

    @params:
        tokens: the text as a single string
    """
    pieces = []
    for word in tokens.split():
        if spellchecker.spell(word):
            replacement = word
        else:
            try:
                replacement = spellchecker.suggest(word)[0]
            except IndexError:
                # No suggestion: keep the separating space, drop the word.
                replacement = ""
        pieces.append(" " + replacement)
    return "".join(pieces)

def stemming(tokens):
    '''
    Stem every whitespace-separated word with the English Snowball stemmer
    and return the stems as a list.
    '''
    stem = SnowballStemmer("english").stem
    return list(map(stem, tokens.split()))

def tweet2words(raw_tweet):
    """
    Removes profile mentions and every non-letter character, drops the
    stopwords and returns the remaining words joined by single spaces.

    The original casing is kept. The stopword comparison is therefore
    case-sensitive and a capitalized stopword such as "The" stays in the
    output.

    @params:
        raw_tweet: the tweet string collected
    """
    callout_regex = "@[A-Za-z0-9_]+"

    # Remove mentions of profiles
    letters_only = re.sub(callout_regex, " ", raw_tweet)
    letters_only = re.sub("[^a-zA-Z]", " ", letters_only)

    # A lowercased split used to be assigned here and overwritten on the next
    # line; only the case-preserving split was ever used.
    words = letters_only.split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

def clean_tweet_length(raw_tweet):
    """
    Counts the words that remain after replacing non-letter characters with
    spaces, lowercasing and discarding the English stopwords.

    @params:
        raw_tweet: the tweet string collected
    """
    lowered = re.sub("[^a-zA-Z]", " ", raw_tweet).lower()
    stops = set(stopwords.words("english"))
    kept = 0
    for word in lowered.split():
        if word not in stops:
            kept += 1
    return kept

In [8]:
# Pick one collected-tweets file to process. The per-file loop that ran the
# PreProcessing class over every file is disabled; only the entry at index 3
# of the directory listing is handled.
tweetsPath = '02_prediction_tweets'
onlyfiles = list(filter(lambda name: isfile(join(tweetsPath, name)), listdir(tweetsPath)))
path = onlyfiles[3]

print("Processing " + path)
# NOTE(review): the listing comes from '02_prediction_tweets' but the file is
# read from 'prediction_tweets/' — confirm both directories hold the same files.
df = pd.read_csv('prediction_tweets/' + path, error_bad_lines=False)

Processing british_airlines.csv


In [40]:
# Reload the selected file and add the feature columns one by one.
df = pd.read_csv('prediction_tweets/' + path, error_bad_lines=False)
df["text"] = df["text"].values.astype("U")

# (new column, source column, extractor), applied strictly in this order:
# 'tweet2words' has to exist before the steps that read it.
featureSteps = [
    ('tweet2words', 'text', tweet2words),
    ("num_capitalized", "tweet2words", countCapitalizedWords),
    ('tweet_length', 'text', clean_tweet_length),
    # Recomputed from 'text' two steps below; kept because this first
    # assignment fixes the position of the column in the frame.
    ("num_negative_words", "tweet2words", countNegative),
    ("num_positive_words", 'text', countPositive),
    ("num_negative_words", 'text', countNegative),
    ("num_neutral_words", 'text', countNeutral),
    ("has_capitalized", 'text', hasCapitalized),
    ("num_capitalised_positive_words", 'text', countPositiveCapitalized),
    ("num_capitalised_negative_words", 'text', countNegativeCapitalized),
    ("num_hashtags", 'text', countHashtags),
    ("num_special_character", 'text', countSpecialCharacters),
    ('correctedText', 'tweet2words', fixSpelling),
]
for column, source, extractor in featureSteps:
    df[column] = df[source].apply(extractor)

df.head(2)

Unnamed: 0,username,user_handle,date,retweets,favorites,text,geological_location,mentions,hashtags,tweet_id,...,tweet_length,num_negative_words,num_positive_words,num_neutral_words,has_capitalized,num_capitalised_positive_words,num_capitalised_negative_words,num_hashtags,num_special_character,correctedText
0,Frank Gardner,FrankRGardner,2018-08-06 18:59,2,36,"Your crew were calm, professional, resourceful...",0.0,,,,...,6,0,1,11,1,0,0,0,4,Your crew calm professional resourceful treme...
1,Frank Gardner,FrankRGardner,2018-08-06 18:41,101,1092,Big thank-u to the lovely crew of @British_Air...,0.0,,@British_Airways @HeathrowAirport,,...,17,1,2,24,1,0,0,0,5,Big thank u lovely crew flight BA today found...


In [43]:
# Export the engineered features without the index. `path` is a file name taken
# from the directory listing, so its extension stays in the middle of the output
# name (e.g. "british_airlines.csv_preprocessed.csv") and the file is written to
# the working directory.
# NOTE(review): a later cell reads its inputs from '03_processed/' — confirm how
# this export gets there.
df.to_csv(path+'_preprocessed.csv', index=False)

In [44]:
# Notebook-level 0/1 switches. Neither name is read in the cells visible here;
# presumably they control saving / loading of fitted models — TODO confirm.
SAVE_MODEL_TO_DISK = 0
LOAD_MODEL = 1

# GENERAL LIBS
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
%matplotlib inline

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

#NLTK
from nltk.stem.snowball import SnowballStemmer

# GENSIM
from gensim.sklearn_api import W2VTransformer
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

In [51]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that picks one column out of the input by key.
    Intended for the text columns of the data.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing to learn: the step only selects a column.
        return self

    def splitString(self, s):
        # Whitespace-split s; values without a split() method yield "".
        try:
            pieces = s.split()
        except AttributeError:
            pieces = ""
        return pieces

    def transform(self, X):
        # Returns the selected column unchanged; no vectorisation happens here.
        return X[self.key]

class Senteces(object):
    """
    Re-iterable stream of tokenised lines read from every entry of a directory.

    Each iteration lists the directory again and yields, for every line of
    every entry, the list of its whitespace-separated words.
    """
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # listdir / join are the names this notebook imports from os and
        # os.path; the os module itself is never imported, so the previous
        # os.listdir / os.path.join calls raised NameError.
        for fname in listdir(self.dirname):
            # The context manager closes each file once its lines are consumed.
            with open(join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line.split()

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that picks one column out of the input by key.
    Intended for the numeric columns of the data.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing to learn: the step only selects a column.
        return self

    def transform(self, X):
        # Indexing with a one-element list (not a bare key) is what the
        # selection relies on.
        wanted = [self.key]
        return X[wanted]

In [52]:
import pickle
from os import listdir
from os.path import isfile, join
from sklearn.externals import joblib

# Regular files found in the models directory (the listing order is used as
# the model order by the cells below).
modelsPath = 'models/'
onlyfiles = list(filter(lambda name: isfile(join(modelsPath, name)), listdir(modelsPath)))
onlyfiles

['SVC.joblib',
 'MLPClassifier.joblib',
 'DecisionTreeClassifier.joblib',
 'MultinomialNB.joblib']

In [53]:
# One fitted model per file, in listing order.
# NOTE(review): joblib.load unpickles the file — only load models from a trusted source.
bestModels = [joblib.load(modelsPath + file) for file in onlyfiles]

In [66]:
# Score every processed file with every loaded model and write one CSV per
# (model, file) pair under '04_output/<classifier name>/'.
tweetsPath = '03_processed/'
onlyfiles = [f for f in listdir(tweetsPath) if isfile(join(tweetsPath, f))]
predictions = []
for model in bestModels:
    for file in onlyfiles:
        # The raw and tokenised text columns are not model inputs.
        newTweets = pd.read_csv(tweetsPath + file).drop(labels=['text', 'tweet2words'], axis=1)
        newTweets['correctedText'] = newTweets["correctedText"].values.astype("U")
        newTweets['sentiment'] = model.predict(newTweets)
        predictions.append(newTweets)
        # Name of the estimator stored in the 'classifier' parameter, without its arguments.
        classifierRepr = str(model.get_params('classifier')['classifier'])
        classifierName = classifierRepr.split('(')[0]
        outputName = file.split('.')[0] + '_prediction.csv'
        newTweets.to_csv('04_output/' + classifierName + '/' + outputName)

In [70]:
# Load the labelled, preprocessed tweets and keep only the model inputs and
# the target. 'tweet2words' is dropped together with the other unused columns
# (it used to be converted to unicode first and discarded right after), and
# "negativereason" is listed once instead of twice.
tweetsDF = pd.read_csv("preprocessed2.csv").drop(labels=["Unnamed: 0",
                                                          "airline",
                                                          "negativereason",
                                                          "airline_sentiment_confidence",
                                                          "negativereason_confidence",
                                                          "airline_sentiment",
                                                          "text",
                                                          "tweet2words"], axis=1)

# Force the text column to unicode strings (missing values become the string "nan").
tweetsDF["correctedText"] = tweetsDF["correctedText"].values.astype("U")

# tweetsDF["correctedText"] = tweetsDF["correctedText"].apply(stemming)

In [71]:
# Every column except the target is a model input; the numeric inputs are
# those that are not one of the text columns either.
target = "sentiment"
features = [c for c in tweetsDF.columns.values if c != target]
numeric_features = [c for c in tweetsDF.columns.values
                    if c not in ('tweet2words', 'correctedText', target)]

X_train, Y_train = tweetsDF[features], tweetsDF[target]

In [72]:
from sklearn.metrics import confusion_matrix

# bestModel = clf[0]['estimator'][0]
# Per-model evaluation on (X_train, Y_train); every measure is a percentage.
# Unpacking the raveled matrix into four values assumes a binary target.
allMeasures = dict()
for i, model in enumerate(bestModels):
    # confusion_matrix takes (y_true, y_pred). With the arguments swapped the
    # matrix is transposed, which exchanges fp and fn and corrupts precision,
    # recall, specificity and fp_rate.
    tn, fp, fn, tp = confusion_matrix(Y_train, model.predict(X_train)).ravel()
    total = tn + fp + fn + tp

    prec = tp/(tp + fp) * 100
    recall = tp/(tp + fn) * 100

    measures = dict()
    measures["acc"] = (tp + tn)/total * 100
    measures["prec"] = prec
    measures["recall_sens"] = recall
    measures["f1_score"] = (2 * prec * recall/(prec + recall))
    # Despite the key name this is the overall misclassification rate,
    # (fp + fn) / total, not fn / (fn + tp).
    measures['miss_rate'] = (fp + fn) / float(total) * 100
    measures['spec'] = tn/float(tn + fp) * 100
    measures['fp_rate'] = fp/float(tn + fp) * 100
    allMeasures[i] = measures

In [89]:
# One row per model, in the order of bestModels (allMeasures is keyed by that index).
performance = pd.DataFrame.from_dict(data=allMeasures, orient='index')
# Row labels are read from the models themselves. The previous hard-coded list
# put DecisionTreeClassifier before MLPClassifier, while the models were loaded
# in directory-listing order, so those two rows carried each other's name.
performance.index = [type(model.get_params()['classifier']).__name__ for model in bestModels]
performance.to_excel("performances.xls")

In [84]:
# Rebinds `tweetsDF` to the contents of "Tweets.csv", replacing the feature table
# loaded earlier. X_train / Y_train were built from the previous frame and are
# not refreshed by this cell.
tweetsDF = pd.read_csv("Tweets.csv")

In [85]:
from sklearn.linear_model import SGDClassifier

In [99]:
# Row count of each of the first four prediction frames (`total` is not updated here).
total = 0
for frame in predictions[:4]:
    rowCount = len(frame)
    print(rowCount)

5370
14168
5903
8243
