In [1]:
import tweepy

#Setting the credentials for the connection with twitter.
auth = tweepy.OAuthHandler('xxxx', 'xxxx')
auth.set_access_token('xxxx', 'xxxx')

#connecting to the twitter api.
api = tweepy.API(auth)

#importing the dataset with pandas.
import pandas as pd
data = pd.read_csv('retweets.csv',low_memory = False)

In [2]:
#filtering for english language tweets only.
data = data[data.lang == 'en']

#dropping duplicate rows.
data.drop_duplicates(subset = "origTweetId",inplace = True) 

#creating the empty lists to append the tweets downloaded from twitter and their corresponding ids.
lst2 = []
tweetid = []

#Loop for extracting the tweets with wait on rate to true to handle the rate limits of the twitter api.
#Setting try catch in order to bypass any exeptions caused by inactive twitter accounts or deleted tweets
#and appending NA in these cases.
for row in data.itertuples():
        try:
            tweet = api.get_status(row.origTweetId,wait_on_rate_limit=True)
            lst2.append(tweet.text)
            tweetid.append(row.origTweetId)
            print(tweet.text)
        except StopIteration:
            break
        except:
            lst2.append('NA')
            tweetid.append(row.origTweetId)
        continue
    


Euro Parliament votes overwhelmingly to reject #ACTA but the Commission say they still intend to press ahead. Another reason to vote @UKIP
US have one currency, one Central Bank and one Govt. Europe has one currency, one Central Bank and...17 govts! Cannot go on like this.
More jobs for young people is my top priority for the EU @socialdemocraterna @maritaulvskog  #EP2014 in Umea Sweden http://t.co/3O6ZM5J8Nt
Vote #Labour, get Tory. Vote Tory, get Labour. Vote #UKIP get UKIP  http://t.co/buStolPbTT
Answering my question in ECR Group J-C Juncker has confirmed that the post of Chief Scientific Advisor will be continued in his presidency.
Robert Halfon raising issue of illegal traveller sites - if UKIP had done it he'd have called us Nazis http://t.co/rBdwA6OYx8 #opportunist
Ignore the Labour lies, this is where #UKIP stands on the #NHS http://t.co/a5cZurjf9n
Next week I'll be grilling EU Commissioner candidates about  the environment. What would you ask? #AskTeamJuncker http://t.co/yspLO

In [31]:
#Creating a pandas dataframe for the tweets and the tweets ids.
df = pd.DataFrame(tweetid)

#renaming the column that will be used for the join with the main dataframe.
df = df.rename(columns={0: "origTweetId"}) 

df["tweet"] = lst2

# joining the downloaded tweets with the dataset on the origTweetid column.
datam = data.merge(df,how='left', on="origTweetId") 

# dropping NAs
datam.dropna(axis = 0, inplace = True) 

#Converting everything to lower case.
datam.tweet = datam.tweet.str.lower() 

#removing punctuation and links for better tokenization of each tweet's text.

datam.tweet.replace(to_replace ="#", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace ="http.*?(\s|$)", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = "-", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = "'", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = ",", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = ":", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = "@", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = "_", value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '"', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '!', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '&', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '\?', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '\.', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = ';', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '\(', value ="", inplace = True,regex=True)
datam.tweet.replace(to_replace = '\)', value ="", inplace = True,regex=True)

#Checking for MEP groups that have less than 50 tweets and dropping them from the dataframe.
datam.groupby("origMepGroupShort")["tweet"].count()
datam= datam[datam.origMepGroupShort != "NI"]
datam= datam[datam.origMepGroupShort != "ENL"]

In [36]:
#importing module to split to train and test dataset.
from sklearn.model_selection import train_test_split 

y = datam.origMepGroupShort

X_train, X_test, y_train, y_test = train_test_split(datam.tweet, y, test_size=0.3, shuffle = True)

import numpy as np

In [41]:
#importing the required modules.
from sklearn import metrics 
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline 

#Building a pipeline to remove english stop words, convert everything to lowercase and strip any accents from the tweets.
#Also adding the TfidfTransformer for the addition of tfidf transformation to the pipeline and using
#the Multinomial Naive Bayes classifier.
text_clf = Pipeline([("vect",CountVectorizer(stop_words = 'english', lowercase = True,strip_accents = "ascii")),
                     ("tfidf",TfidfTransformer()),
                     ("clf",MultinomialNB()),])

#Setting the parameters for the crossvalidation procedure. In this case we will examine the use of bigrams(ngram range) or not, 
#the use of tfidf transformation or not and the alpha values which is the tuning parameter of the Naive Bayes Classifier.
parameters = {"vect__ngram_range":[(1,1),(1,2)],
              "tfidf__use_idf":(True,False),
              "clf__alpha": (0.01, 0.001,1), 
              }

#Setting the n_jobs, numbers of cores for parallelization to 7 in order to keep 1 core for os tasks, setting the used metric
#to accuracy and refitting according to that metric.
gs_clf = GridSearchCV(text_clf,parameters,n_jobs= 7, cv =5,scoring='accuracy',refit='accuracy',return_train_score=True)

#Fitting the tuned model to the training dataset.
gs_clf = gs_clf.fit(X_train,y_train)

#Checking the best parameters that were picked from the cross validation for maximum accuracy.
gs_clf.best_estimator_

#performing predictions on the test dataset.
predicted = gs_clf.predict(X_test)

#Calculating the accuracy manually.
accuracy_NB = np.mean(predicted == y_test)

#More metrics on each classifier are summarized at the end of the Notebook.
print("The accuracy of the tuned Naive Bayes Classifier is" + round(accuracy_NB,2).astype(str))

The accuracy of the tuned Naive Bayes Classifier is 0.62


In [62]:
from sklearn.ensemble import RandomForestClassifier

text_clf_forest = Pipeline([("vect2",CountVectorizer(stop_words = 'english', lowercase = True,strip_accents = "ascii")),
                     ("tfidf",TfidfTransformer()),
                     ("rf",RandomForestClassifier()),])

#assigning 5 test values for the max depth of each tree parameter and appending the none option to include it in the grid search 
#of the random forest parameters.
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
    
parameters = {"vect2__ngram_range":[(1,1),(1,2)],
              "tfidf__use_idf":(True,False),
              "rf__n_estimators": (100,250,500),  #Number of trees that will be created.
              "rf__max_features": ('auto', 'sqrt', 'log2'), #The number of max features for each tree.
              "rf__max_depth": max_depth,
              "rf__criterion": ('gini', 'entropy'), #splitting criterion.
              }

gs_forest = GridSearchCV(text_clf_forest,parameters,n_jobs= 7, cv =5,scoring= 'accuracy',refit='accuracy',
                         return_train_score=True)

#fitting the model to the train data
gs_forest = gs_forest.fit(X_train,y_train)

#Checking the best parameters that were picked from the cross validation for maximum accuracy.
gs_forest.best_estimator_

#performing predictions on the test dataset.
predicted2 = gs_forest.predict(X_test)

#Calculating the accuracy manually.
accuracy_RF = np.mean(predicted2 == y_test)

#More metrics on each classifier are summarized at the end of the Notebook.
print("The accuracy of the tuned Random Forest Classifier is " + round(accuracy_RF,2).astype(str))

The accuracy of the tuned Random Forest Classifier is 0.61


In [53]:
from sklearn import svm                        

text_clf_svm = Pipeline([("vect3",CountVectorizer(stop_words = 'english', lowercase = True,strip_accents = "ascii")),
                     ("tfidf",TfidfTransformer()),
                     ("svm",svm.SVC()),])
    
parameters = {"vect3__ngram_range":[(1,1),(1,2)],
              "tfidf__use_idf":(True,False),
              "svm__C": (0.001, 0.01, 0.1, 1), 
              "svm__kernel": ('linear', 'poly', 'rbf'),
              "svm__gamma": (0.001, 0.01, 0.1, 1),
              }
gs_svm = GridSearchCV(text_clf_svm,parameters,n_jobs= 7, cv =5,scoring='accuracy',refit='accuracy',return_train_score=True)

#fitting the model to the train data.
gs_svm = gs_svm.fit(X_train,y_train)

#Checking the best parameters that were picked from the cross validation for maximum accuracy.
gs_svm.best_estimator_

#performing predictions on the test dataset.
predicted3 = gs_svm.predict(X_test)

#Calculating the accuracy manually.
accuracy_svm = np.mean(predicted3 == y_test)

#More metrics on each classifier are summarized at the end of the Notebook.
print("The accuracy of the tuned Support Vector Machine Classifier is " + round(accuracy_svm,2).astype(str))

The accuracy of the tuned Support Vector Machine Classifier is 0.63


In [64]:
from sklearn import metrics
from sklearn.dummy import DummyClassifier

text_clf_dummy = Pipeline([("vect4",CountVectorizer(stop_words = 'english', lowercase = True,strip_accents = "ascii")),
                     ("tfidf",TfidfTransformer()),
                     ("dummy",DummyClassifier()),])

parameters = {"vect4__ngram_range":[(1,1),(1,2)],
              "tfidf__use_idf":(True,False),
              "dummy__strategy":('stratified', 'most_frequent', 'prior', 'uniform'),
              }

gs_dummy = GridSearchCV(text_clf_dummy,parameters,n_jobs= 7, cv =5,scoring='accuracy',refit='accuracy',return_train_score=True)

#fitting the model to the train data.
gs_dummy = gs_dummy.fit(X_train,y_train)

#Checking the best parameters that were picked from the cross validation for maximum accuracy.
gs_dummy.best_estimator_

#performing predictions on the test dataset.
predicted4 = gs_dummy.predict(X_test)

#Calculating the accuracy manually.
accuracy_dummy = np.mean(predicted4 == y_test)

#More metrics on each classifier are summarized at the end of the Notebook.
print("The accuracy of the tuned Dummy Classifier is " + round(accuracy_dummy,2).astype(str))

The accuracy of the tuned Dummy Classifier is 0.26


In [86]:
from sklearn.neighbors import KNeighborsClassifier

text_clf_knn = Pipeline([("vect5",CountVectorizer(stop_words = 'english', lowercase = True,strip_accents = "ascii")),
                     ("tfidf",TfidfTransformer()),
                     ("knn",KNeighborsClassifier()),])

neighbors = np.arange(3,33)
parameters = {"vect5__ngram_range":[(1,1),(1,2)],
              "tfidf__use_idf":(True,False),
              "knn__n_neighbors":(neighbors),
              }

gs_knn = GridSearchCV(text_clf_knn,parameters,n_jobs= 7, cv =5,scoring='accuracy',refit='accuracy',return_train_score=True)

#fitting the model to the train data.
gs_knn = gs_knn.fit(X_train,y_train)

#Checking the best parameters that were picked from the cross validation for maximum accuracy.
gs_knn.best_estimator_

#performing predictions on the test dataset.
predicted5 = gs_knn.predict(X_test)

#Calculating the accuracy manually.
accuracy_knn = np.mean(predicted5 == y_test)

#More metrics on each classifier are summarized at the end of the Notebook.
print("The accuracy of the tuned K-Nearest Neighbors Classifier is " + round(accuracy_knn,2).astype(str))

The accuracy of the tuned K-Nearest Neighbors Classifier is 0.56


In [85]:
#As we can see in the below tables all metrics agree that the Support Vector Machine Classifier is the most accurate algorithm
#amongst Random Forest, Naive Bayes, knn and the dummy classifier for the specific tweets dataset.
#Upon closer inspection we also see that all methods that were tested had much better performance than the dummy classifier.

print("Naive Bayes" +"\n" + metrics.classification_report(y_test,predicted) )
print("Random Forest" +"\n" + metrics.classification_report(y_test,predicted2) )
print("Support Vector Machine" +"\n" + metrics.classification_report(y_test,predicted3) )
print("Dummy Classifier" +"\n" + metrics.classification_report(y_test,predicted4) )
print("K-Nearest Neighbors" +"\n" + metrics.classification_report(y_test,predicted5) )

Naive Bayes Classifier
              precision    recall  f1-score   support

        ALDE       0.53      0.54      0.53       438
         ECR       0.57      0.28      0.38       318
        EFDD       0.68      0.88      0.76       845
         EPP       0.63      0.66      0.64       497
     GUE-NGL       0.78      0.32      0.45        79
  Greens-EFA       0.57      0.32      0.41       288
         S&D       0.61      0.65      0.63       779

    accuracy                           0.62      3244
   macro avg       0.62      0.52      0.54      3244
weighted avg       0.62      0.62      0.61      3244

Random Forest Classifier
              precision    recall  f1-score   support

        ALDE       0.61      0.42      0.50       438
         ECR       0.84      0.23      0.36       318
        EFDD       0.55      0.95      0.70       845
         EPP       0.69      0.64      0.66       497
     GUE-NGL       0.77      0.30      0.44        79
  Greens-EFA       0.63      0