In [8]:
import numpy as np
import pandas as pd 
from numpy import array
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
# NOTE(review): machine-specific absolute path; the notebook only runs where this file exists.
TWEETS_CSV_PATH = '/home/alaa/Desktop/Data_Analytics/Tweets.csv'
data = pd.read_csv(TWEETS_CSV_PATH)

In [3]:
def remove_mentions(input_tweet):
    """Strip Twitter @mentions (``@`` followed by word characters) from tweets.

    Parameters
    ----------
    input_tweet : pandas.Series, str, or iterable of str
        A column of tweets (as passed by ``DataFrame.apply``), a single
        tweet, or any iterable of tweet strings.

    Returns
    -------
    pandas.Series, str, or list of str
        A new object of the matching kind with mentions removed. The input
        is never modified. For a Series the index is preserved and missing
        values stay missing.
    """
    mention_pattern = r'@\w+'
    if isinstance(input_tweet, str):
        return re.sub(mention_pattern, '', input_tweet)
    if isinstance(input_tweet, pd.Series):
        # Vectorised and index-agnostic: positional `series[i]` lookups are
        # label-based and fail on any index that is not 0..n-1.
        return input_tweet.str.replace(mention_pattern, '', regex=True)
    return [re.sub(mention_pattern, '', tweet) for tweet in input_tweet]

def remove_links(input_tweet):
    """Strip URLs (``http`` followed by any run of non-whitespace) from tweets.

    Parameters
    ----------
    input_tweet : pandas.Series, str, or iterable of str
        A column of tweets (as passed by ``DataFrame.apply``), a single
        tweet, or any iterable of tweet strings.

    Returns
    -------
    pandas.Series, str, or list of str
        A new object of the matching kind with links removed. The input is
        never modified. For a Series the index is preserved and missing
        values stay missing.
    """
    link_pattern = r'http\S+'
    if isinstance(input_tweet, str):
        return re.sub(link_pattern, '', input_tweet)
    if isinstance(input_tweet, pd.Series):
        # Vectorised and index-agnostic: positional `series[i]` lookups are
        # label-based and fail on any index that is not 0..n-1.
        return input_tweet.str.replace(link_pattern, '', regex=True)
    return [re.sub(link_pattern, '', tweet) for tweet in input_tweet]

def remove_stopwords(input_tweet, stopword_list=None, important_stopwords=("not", "no")):
    """Drop stop words and single-character tokens from one tweet.

    Parameters
    ----------
    input_tweet : str
        A single tweet; it is split on whitespace.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.
    important_stopwords : iterable of str, optional
        Stop words that are kept anyway. For sentiment analysis, negations
        such as "not" and "no" carry an important signal and should not be
        removed.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stopwords.words('english')
    # A set gives constant-time membership checks instead of scanning the
    # whole stop-word list for every word of the tweet.
    removable = set(stopword_list) - set(important_stopwords)
    kept_words = [
        word
        for word in input_tweet.split()
        if word not in removable and len(word) > 1
    ]
    return " ".join(kept_words)

def remove_punctuation(input_tweet):
    """Delete every character that is neither a word character nor whitespace.

    Parameters
    ----------
    input_tweet : pandas.Series, str, or iterable of str
        A column of tweets (as passed by ``DataFrame.apply``), a single
        tweet, or any iterable of tweet strings.

    Returns
    -------
    pandas.Series, str, or list of str
        A new object of the matching kind with punctuation removed. The
        input is never modified. For a Series the index is preserved and
        missing values stay missing.
    """
    punctuation_pattern = r'[^\w\s]'
    if isinstance(input_tweet, str):
        return re.sub(punctuation_pattern, '', input_tweet)
    if isinstance(input_tweet, pd.Series):
        # Vectorised and index-agnostic: positional `series[i]` lookups are
        # label-based and fail on any index that is not 0..n-1.
        return input_tweet.str.replace(punctuation_pattern, '', regex=True)
    return [re.sub(punctuation_pattern, '', tweet) for tweet in input_tweet]

def lower_case(input_tweet):
    """Convert tweets to lower case.

    Parameters
    ----------
    input_tweet : pandas.Series, str, or iterable of str
        A column of tweets (as passed by ``DataFrame.apply``), a single
        tweet, or any iterable of tweet strings.

    Returns
    -------
    pandas.Series, str, or list of str
        A new lower-cased object of the matching kind. The input is never
        modified. For a Series the index is preserved and missing values
        stay missing.
    """
    if isinstance(input_tweet, str):
        return input_tweet.lower()
    if isinstance(input_tweet, pd.Series):
        # Vectorised and index-agnostic: positional `series[i]` lookups are
        # label-based and fail on any index that is not 0..n-1.
        return input_tweet.str.lower()
    return [tweet.lower() for tweet in input_tweet]

In [4]:
# Keep only the tweet text and its sentiment label.
data_new = data[['text', 'airline_sentiment']]

# Column-wise cleaning: DataFrame.apply hands each column to the helper,
# so both selected columns pass through every step.
preprocessed_data = (
    data_new
    .apply(remove_mentions)
    .apply(remove_links)
    .apply(remove_punctuation)
    .apply(lower_case)
)

# Stop-word filtering works on one tweet string at a time.
cleaned_tweets = [remove_stopwords(tweet) for tweet in preprocessed_data.text]

# Features are the cleaned tweet strings; targets are integer-encoded sentiments.
X = cleaned_tweets
sentiment_codes = {'negative':0, 'positive':1, 'neutral':2}
Y = preprocessed_data['airline_sentiment'].map(sentiment_codes).astype(int)

In [6]:
# Hold out 15% of all samples as the final test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=20
)
# Carve the validation set out of the remainder: 18% of the remaining 85%
# is roughly 15% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.18, random_state=20
)

In [7]:
"""There are many ways of representing words as numbers; one of them is tf-idf, which usually gives better results than
one-hot encoding. Tf-idf stands for term frequency-inverse document frequency: a word gets a greater weight the more
often it appears in a tweet, and a smaller weight the more tweets across the whole data set contain it."""

def tf_idf(X_train, X_val, X_test, ngram_range=(1, 2)):
    """Turn the three text splits into TF-IDF feature matrices.

    The vectorizer is fitted on the training texts only; the validation and
    test texts are transformed with that same fitted vectorizer.

    Parameters
    ----------
    X_train, X_val, X_test : iterable of str
        Training, validation and test documents.
    ngram_range : tuple of (int, int), optional
        Passed to ``TfidfVectorizer``. The default ``(1, 2)`` uses unigrams
        and bigrams.

    Returns
    -------
    tuple
        ``(train_features, val_features, test_features)``.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    train_features = vectorizer.fit_transform(X_train)
    val_features = vectorizer.transform(X_val)
    test_features = vectorizer.transform(X_test)
    return (train_features, val_features, test_features)

In [9]:
def results(labels, pred):
    """Print the confusion matrix, then the classification report.

    Both are computed from the true ``labels`` and the predictions ``pred``.
    """
    for summarize in (confusion_matrix, classification_report):
        print(summarize(labels, pred))


def svm(training_features, labels_train, test_features, labels_test, C=1):
    """Fit a ``LinearSVC`` and print its evaluation on a second data set.

    Prints the accuracy, then the confusion matrix and classification
    report via ``results``. Nothing is returned.

    Parameters
    ----------
    training_features, labels_train
        Features and labels the model is fitted on.
    test_features, labels_test
        Features and labels the fitted model is evaluated on.
    C : float, optional
        Regularisation parameter passed to ``LinearSVC``. Defaults to 1.
    """
    model = LinearSVC(C=C)
    model.fit(training_features, labels_train)
    # Predict once and reuse the result for both the accuracy and the reports.
    predictions = model.predict(test_features)
    print("Accuracy of SVM : %s" % (accuracy_score(labels_test, predictions)))
    results(labels_test, predictions)

In [10]:
# Fit TF-IDF on the training tweets and vectorize all three splits.
train_features, val_features, test_features = tf_idf(X_train, X_val, X_test)

In [11]:
# Train on the training split and report metrics on the validation split.
svm(train_features, y_train, val_features, y_val)

Accuracy of SVM : 0.7977678571428571
[[1311   22   58]
 [  73  258   32]
 [ 218   50  218]]
              precision    recall  f1-score   support

           0       0.82      0.94      0.88      1391
           1       0.78      0.71      0.74       363
           2       0.71      0.45      0.55       486

    accuracy                           0.80      2240
   macro avg       0.77      0.70      0.72      2240
weighted avg       0.79      0.80      0.78      2240



In [13]:
# Report metrics on the held-out test split; svm() fits a fresh model on the training split first.
svm(train_features, y_train, test_features, y_test)

Accuracy of SVM : 0.7987249544626593
[[1324   18   52]
 [  84  239   39]
 [ 207   42  191]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1394
           1       0.80      0.66      0.72       362
           2       0.68      0.43      0.53       440

    accuracy                           0.80      2196
   macro avg       0.77      0.68      0.71      2196
weighted avg       0.79      0.80      0.78      2196

