# Reasons for negative comments about airlines

The exercise is concerned with a dataset of tweets that mention several US airline companies. Each tweet was manually labelled as expressing negative sentiment. In addition, each tweet was marked up for the reason for the negative sentiment, with labels such as "Customer Service Issue", "Late Flight", "Lost Luggage".

The task is to train and evaluate an SVM classifier of the reasons for negative sentiment. Part of the solution is provided below; you need to supply your own code in the places marked with "???".

In the training stage, you should try several values for `C`. Find an optimal setting for it.

Provide comments on the learning curve and confusion matrix plots. Suggest ways to improve the accuracy of the classifier.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set_theme(palette="Set2")

# Load the data

Load the dataset from the URL, select only the "text" and "negativereason" columns, and drop any rows with missing values.

In [None]:
# read only the tweet text and the reason label from the hosted CSV file
DATA_URL = "https://raw.githubusercontent.com/chawit/data-Airline/master/TweetsAirline.csv"
df = pd.read_csv(DATA_URL, usecols=["text", "negativereason"])
df.head()

In [None]:
# keep only the rows in which both the text and the reason are present
df = df.dropna()

In [None]:
df.shape

In [None]:
df["negativereason"].value_counts()

# Data cleaning and transformation

In [None]:
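# split the data into the training and test parts; the cells below expect the
# tweet texts in `docs_train` / `docs_test` and the labels in `y_train` / `y_test`
# the test part should be 0.2 of the whole dataset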

???

In [None]:
print(f"{len(docs_train)} train and {len(docs_test)} test instances")

In [None]:
import re

from sklearn.feature_extraction.text import CountVectorizer


def custom_preprocessor(text):
    """Remove some elements of a tweet (URLs, hashtags, Twitter handles,
    any other non-words) using regular expressions
    """
    # RE for a URL
    urls = re.compile(r'https?:\/\/(www\.)?[a-z]+\.(com|net|org|ly)\S+')
    
    # RE for a hashtag, e.g. #MachineLearning #DeepLearning
    hashtags = ???
    
    # RE for a Twitter handle, e.g., @scikit_learn, @ogrisel
    handles = ???
    
    text = urls.sub('', text)
    text = hashtags.sub('', text)
    text = handles.sub('', text)
    
    # bonus question: remove any sequences of two or more characters that are not
    # letters, digits or spaces, e.g. ":)",  "!!!", "<<<", etc.
    nonwords = re.compile(???)
    text = nonwords.sub('', text)
    
    # lower-case everything
    text = text.lower()
    
    return text


from sklearn.feature_extraction.text import strip_accents_unicode


def preprocess_tweet(text):
    """Strip accents from a tweet, then clean it with custom_preprocessor().

    CountVectorizer skips its built-in `strip_accents` and `lowercase` steps
    whenever a custom preprocessor is supplied, so accented characters have to
    be converted here explicitly. Lower-casing is already done inside
    custom_preprocessor().
    """
    return custom_preprocessor(strip_accents_unicode(text))


count_vectorizer = CountVectorizer(
    # `strip_accents` and `lowercase` are deliberately not set: they would be
    # ignored because a custom preprocessor is used (see preprocess_tweet)
    tokenizer=None,        # None - use the default tokenizer
    preprocessor=preprocess_tweet,  # accent stripping + custom tweet cleaning
    stop_words="english",
    ngram_range=(1, 1),    # min and max range of ngrams
    analyzer="word",       # split the document into words, rather than e.g. characters
    max_df=1.0,            # ignore words with df greater than the value (int represents count,
                           # float represents proportion of documents)
    min_df=0.007           # ignore words with df lower than the value (int represents count,
                           # float represents proportion of documents)
)

In [None]:
# create a document-by-word matrix for training data
docs_train_counts = ???

In [None]:
docs_train_counts.shape

In [None]:
# create a document-by-word matrix for test data
docs_test_counts = ???

In [None]:
# transform word counts in each document to TFIDF weights

???

# fit and transform the training set with "fit_transform()"
docs_train_tfidf = ???

# transform test
docs_test_tfidf = ???

In [None]:
# scale the data with MaxAbsScaler

X_train = ???
X_test = ???

# Baseline

Calculate the F-score for the majority baseline (every label is "Customer Service Issue"):

In [None]:
# Majority baseline: pretend that every instance is labelled
# "Customer Service Issue" and work out the per-class F-scores from the
# label counts in the training set.
labels, counts = np.unique(y_train, return_counts=True)
total = counts.sum()
fscores = []
for k, v in zip(labels, counts):
    if k == "Customer Service Issue":
        # every instance is predicted as this class, so precision is the
        # share of the class in the data and recall is 1
        p = v / total
        r = 1.0
        # F-score is the harmonic mean of precision and recall
        f = 2/(1/p + 1/r)
        fscores.append(f)
        print(f"{k:30} {v:5} fscore: {f:.3}")
    else:
        # a class that is never predicted gets an F-score of 0
        fscores.append(0)
        print(f"{k:30} {v:5} fscore: 0.0")

# combine the per-class scores collected in `fscores` into a single number
macroaveraged_f = ???
print(f"\nMacroaveraged baseline: {macroaveraged_f}")

# Train a model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
import time
from sklearn.svm import LinearSVC

# linear SVM with a fixed random_state, so that repeated runs are comparable
lsvm = LinearSVC(random_state=7, max_iter=10000)

# specify the hyperparameters and their values
param_grid = {
    'C': [???]
}

# we'll use 5-fold cross-validation
# candidates are ranked by macro-averaged F-score; train scores are kept as
# well because the results table below reads `mean_train_score`
grid_search = GridSearchCV(lsvm, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

# time the whole search; note that `end` holds the elapsed time in seconds,
# not a timestamp
start = time.time()
grid_search.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# keep only the columns needed to compare training and validation scores
summary_columns = ['params', 'mean_train_score', 'mean_test_score']
cv_results = pd.DataFrame(grid_search.cv_results_)[summary_columns]

# gap between training and validation score, as a percentage of the training
# score: a large value means the setting overfits
score_gap = cv_results["mean_train_score"] - cv_results["mean_test_score"]
cv_results["diff, %"] = 100 * score_gap / cv_results["mean_train_score"]

# widen the columns so that the `params` dictionaries are not truncated
pd.set_option('display.max_colwidth', 100)
cv_results.sort_values('mean_test_score', ascending=False)

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


def plot_learning_curves(model, X, y, start=50, step=100, test_size=0.2,
                         random_state=7):
    """Plot training and validation macro F-scores for growing training sets.

    The data is split once into a training and a validation part. The model
    is then re-fitted on the first `m` training instances for increasing
    values of `m`, and the macro-averaged F-score on those `m` instances and
    on the whole validation part is recorded and plotted against `m`.

    Parameters
    ----------
    model : estimator with `fit` and `predict`
        The classifier to evaluate. It is fitted in place, so after the call
        it holds the fit for the largest subset that was tried.
    X : feature matrix, one row per instance
    y : labels, one per row of `X`
    start : int, default 50
        Size of the smallest training subset.
    step : int, default 100
        Increment between consecutive training subset sizes.
    test_size : float, default 0.2
        Proportion of the data held out for validation.
    random_state : int, default 7
        Seed for the train/validation split.

    Returns
    -------
    None
        The function only draws the plot.
    """
    # hold out a validation part that stays the same for every subset size
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    train_scores, val_scores, sizes = [], [], []

    # grow the training subset from `start` instances in increments of `step`
    for m in range(start, X_train.shape[0], step):

        # fit a model on the first m training instances
        model.fit(X_train[:m], y_train[:m])

        # predict for both the training subset and the validation set
        yhat_train = model.predict(X_train[:m])
        yhat_val = model.predict(X_val)

        # calculate the F-scores; the argument order is (y_true, y_pred)
        _, _, train_fscore, _ = precision_recall_fscore_support(
            y_train[:m], yhat_train, average="macro")
        _, _, val_fscore, _ = precision_recall_fscore_support(
            y_val, yhat_val, average="macro")

        train_scores.append(train_fscore)
        val_scores.append(val_fscore)
        sizes.append(m)

    # make a plot
    temp_df = pd.DataFrame({"training set size": sizes,
                            "train": train_scores,
                            "val": val_scores})

    ax = temp_df.plot(x="training set size", y=["train", "val"], kind="line",
                      figsize=(6, 4))
    ax.set_ylabel("macro F-score")

In [None]:
lsvm = LinearSVC(???, random_state=7, max_iter=10000)

plot_learning_curves(lsvm, X_train, y_train)

# Evaluate on test

In [None]:
# re-train the best model found during cross-validation

lsvm = LinearSVC(???, random_state=7, max_iter=10000).fit(X_train, y_train)
yhat = lsvm.predict(X_test)

# macro-averaged precision, recall and f-score
p, r, f, s = precision_recall_fscore_support(y_test, yhat, average="macro")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F score: {f}")

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (12, 12)

from sklearn.metrics import ConfusionMatrixDisplay

# confusion matrix of the re-trained model on the test set, normalised over
# the true labels; tick labels are rotated so the class names stay readable
ConfusionMatrixDisplay.from_estimator(
    lsvm,
    X_test,
    y_test,
    normalize='true',
    display_labels=lsvm.classes_,
    cmap='Blues',
    xticks_rotation=45,
)