# General overview

This notebook presents our work on analyzing how passengers feel about aircraft equipment.
The objective of this project is to find out what people think of this equipment in order to improve customer service on the aircraft concerned.
To achieve this, we have divided our work into two main parts: 
- A pre-processing part, in which we prepare our data and extract what is important to us; to do this, we examine a significant sample of 3,000 comments in order to identify the main subjects of each comment.
- A sentiment analysis model; based on these topics, we go through all the comments to capture people's opinions.

# Import the data

In [None]:
# Install the aspect-based sentiment analysis package that is imported below.
# The leading "!" is IPython shell-escape syntax, so this line only works when
# run inside a notebook, not as plain Python.
!pip install aspect_based_sentiment_analysis

In [4]:
import tensorflow as tf
import aspect_based_sentiment_analysis as absa
from collections import Counter
from collections import Iterable
import itertools
import numpy as np
import nltk
import spacy
import pandas as pd

In [None]:
#  To improve computation speed - run on GPU
# Notebook line magic requesting TensorFlow 2.x (presumably a Colab-specific
# magic - confirm it exists in the target runtime).
get_ipython().run_line_magic('tensorflow_version', '2.x')

# Ask TensorFlow which GPU it can see and stop right away when it is not the
# first GPU device, since later cells pin their work to '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
#  Load up our dataset
# Column labels applied to the semicolon-separated file, in file order.
col_names = [
    "review_ID", "date_published", "global_ratings", "reviews_titles",
    "customers_countries", "reviews_body", "is_verified", "aircraft",
    "type_traveller", "seat_type", "route_provenance", "route_destination",
    "date_flown", "seat_comfort", "food_beverages", "cabin_staff_service",
    "sleep_comfort", "sitting_comfort", "seat_width", "seat_length",
    "seat_privac", "power_supply", "seat_storage", "is_recommended",
    "is_airline_review", "airline_name"]

# Explicit names are passed, so the first line of the file is read as an
# ordinary data row; `.iloc[1:]` then discards that first row (presumably the
# file's own header line - confirm). The remaining rows keep index labels
# starting at 1.
df = pd.read_csv(
    "airlines_dataset_exhaustive.csv",
    sep=";",
    names=col_names,
    skiprows=0,
    encoding='utf-8',
    low_memory=False,
).iloc[1:]

# Display the data frame (rendered as the cell output in a notebook)
df

# Preprocess our data

In [7]:
#  Load NLP
# spaCy "en_core_web_sm" pipeline, used by the pre-processing cells to extract
# noun chunks from the reviews. NOTE: the name `nlp` is rebound to the
# sentiment model (`absa.load(...)`) later in this notebook, so cell order
# matters.
nlp = spacy.load("en_core_web_sm")

## 1) Get the main aspects based on the column 'is_airline_review'

In [8]:
#  Create a function to flat our list
def flatten(lis):
    """Yield the leaf items of an arbitrarily nested iterable, depth first.

    Any iterable item is descended into recursively, except ``str`` values,
    which are yielded whole instead of being split into characters.

    Args:
        lis: An iterable whose items may themselves be iterables.

    Yields:
        Every non-iterable item (and every string) in encounter order.
    """
    # Imported locally from ``collections.abc``: the ``collections.Iterable``
    # alias this function used to rely on was removed in Python 3.10.
    from collections.abc import Iterable

    for item in lis:
        if isinstance(item, Iterable) and not isinstance(item, str):
            yield from flatten(item)
        else:
            yield item

In [9]:
#  Lemmatize a column in pandas
def lemmatize_text(text):
    """Tokenize *text* and return the list of lemmatized tokens.

    Relies on the module-level ``w_tokenizer`` and ``lemmatizer`` objects,
    which are created in a later cell and must exist before this is called.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
#  Select only reviews with is_airline_review = 0
# NOTE(review): nothing in this block is visibly a TensorFlow op, so the
# device scope probably has no effect on the spaCy/pandas work - confirm.
with tf.device('/device:GPU:0'):
    # The flag is compared with the string '0'. Take an explicit copy so the
    # column assignments below write to an independent frame instead of a
    # slice of `df` (avoids SettingWithCopyWarning / copy-on-write no-ops).
    df_reduce = df[df['is_airline_review'] == '0'].copy()

    # Lower-case the comments before parsing them
    df_reduce['reviews_body'] = df_reduce['reviews_body'].str.lower()

    # For each comment, keep the root word of every noun chunk whose root is
    # tagged NOUN; these are the candidate aspect terms.
    aspect_terms = [
        ' '.join(chunk.root.text
                 for chunk in review.noun_chunks
                 if chunk.root.pos_ == 'NOUN')
        for review in nlp.pipe(df_reduce['reviews_body'])
    ]
    df_reduce['aspect_terms'] = aspect_terms

    # Lemmatize the 'aspect_terms' column to clean the words. The tokenizer
    # and lemmatizer are module-level names because lemmatize_text() reads
    # them as globals.
    nltk.download('wordnet')
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    df_reduce['aspect_terms'] = df_reduce['aspect_terms'].apply(lemmatize_text)

    # One flat list with every lemmatized aspect word of every comment
    allwords = list(flatten(df_reduce['aspect_terms'].tolist()))

    # The 500 most frequent words, as (word, count) pairs
    top_500 = Counter(allwords).most_common(500)

    # Keep only the words themselves
    first_tuple_elements = [word for word, _count in top_500]

# Export the top 500 words to .csv
export = pd.DataFrame(first_tuple_elements)
export.to_csv(r"file.csv")

We exported the 500 most frequent words to a spreadsheet. Then we manually selected 40 words that were significant for our aspect-based analysis. This selection was based on common sense, and no NLP algorithm could have helped us accomplish this task:
many words appeared with different spellings, many synonyms were present, some words had no meaning, and other words were not related to the equipment, which made them useless for our client.
We then performed our sentiment analysis based on these 40 words.

In [12]:
#  Import our 40 chosen words
# Read the hand-picked words from the headerless "words.csv" and keep its
# first column as a plain Python list.
words_chosen = pd.read_csv("words.csv", header=None)[0].tolist()

## 2) Get aspect terms on the whole dataset

Now that we have our chosen words (our topics), we can retrieve, over the whole dataset, what people feel about each of these words.
In this part, we keep only the reviews containing at least one aspect present in our chosen words.

In [None]:
# Get the aspect terms of each comment based on its nouns.
# WARNING: the authors report about 1h30 of running time on CPU.
# NOTE(review): nothing in this block is visibly a TensorFlow op, so the
# device scope probably has no effect on the spaCy/pandas work - confirm.
with tf.device('/device:GPU:0'):
    df['reviews_body'] = df['reviews_body'].str.lower()

    # For each comment, keep the root word of every noun chunk whose root is
    # tagged NOUN.
    aspect_terms = [
        ' '.join(chunk.root.text
                 for chunk in review.noun_chunks
                 if chunk.root.pos_ == 'NOUN')
        for review in nlp.pipe(df['reviews_body'])
    ]
    df['aspect_terms'] = aspect_terms

    # Lemmatize 'aspect_terms' on the whole dataset
    df['aspect_terms_lem'] = df['aspect_terms'].apply(lemmatize_text)

    # Keep, per comment, only the chosen words, each one once, as a
    # ', '-separated string. dict.fromkeys() de-duplicates while preserving
    # first-occurrence order, so the result is deterministic and every row is
    # processed (the previous per-row chained assignment over
    # range(1, len(df)) missed one row and depended on set ordering).
    chosen = set(words_chosen)  # O(1) membership tests
    df['relevant_aspects'] = [
        ', '.join(dict.fromkeys(str(word) for word in terms if word in chosen))
        for terms in df['aspect_terms_lem']
    ]

    # Keep only rows with at least one chosen aspect; copy so the edits below
    # write to an independent frame rather than a slice of `df`.
    airlines_comment = df[df['relevant_aspects'] != ''].copy()

    # Replace each comment by its lemmatized form (tokens joined by spaces)
    airlines_comment['reviews_body'] = airlines_comment['reviews_body'].apply(
        lambda text: ' '.join(str(token) for token in lemmatize_text(text)))

    # The lemmatized aspect list is no longer needed on the filtered frame
    airlines_comment = airlines_comment.drop(['aspect_terms_lem'], axis=1)

# Model - sentiment analysis

In this part, we obtain people's opinions about the chosen words based on their comments.
The sentiment analysis model comes from the 'aspect_based_sentiment_analysis' library, which relies on the well-known BERT model for its predictions.

In [16]:
def get_relevant_qualifiers(aspect, k):
    """Return the ``k`` text tokens with the largest pattern weights.

    Args:
        aspect: Object exposing ``review.patterns[0].weights`` and
            ``text_tokens``. Assumes ``weights`` is one-dimensional and
            aligned position by position with ``text_tokens`` - TODO confirm
            against the library.
        k: Maximum number of tokens to return.

    Returns:
        A NumPy array of at most ``k`` tokens, ordered from the largest
        weight to the smallest (ties keep text order). Fewer than ``k``
        tokens are returned when the text is shorter than ``k``; an empty
        array is returned for ``k <= 0``.
    """
    weights = np.asarray(aspect.review.patterns[0].weights, dtype=float)
    text = np.asarray(aspect.text_tokens)
    # Clamp k: np.argpartition raised when k exceeded the number of weights.
    k = max(0, min(k, weights.size))
    # A stable sort on the negated weights gives a descending order in which
    # equal weights keep their original position.
    idx = np.argsort(-weights, kind="stable")[:k]
    return text[idx]

In [None]:
#  Load aspect based sentiment analysis 
# Rebinds `nlp` (previously the spaCy pipeline) to the ABSA model loaded with
# its default arguments. The model-application cell below loads the model
# again with a pattern recognizer and rebinds `nlp` once more.
nlp = absa.load()

In [None]:
#Apply our model - 4h running on CPU

#  Get only columns needed for the analysis
# Plain lists of the three columns needed for the analysis (lists avoid
# label-based indexing issues while iterating).
comments = list(airlines_comment['reviews_body'])
aspects = list(airlines_comment['relevant_aspects'])
ids = list(airlines_comment['review_ID'])

# Apply sentiment analysis to our reviews
with tf.device('/device:GPU:0'):
    recognizer = absa.aux_models.BasicPatternRecognizer()
    nlp = absa.load(pattern_recognizer=recognizer)

    # One entry per review: (review_ID, [(aspect, class index, score, words)])
    result = []

    for review_id, comment, aspect_str in zip(ids, comments, aspects):
        text = str(comment)[:512]  # Limit comments to 512 characters
        # Score at most 4 aspects per review (computation cost)
        aspect = str(aspect_str).split(", ")[:4]
        completed_task = nlp(text=text, aspects=aspect)
        result_per_aspect = []

        for aspect_name, example in zip(aspect, completed_task.examples):
            # Highest class score and the index of that class
            score = max(example.scores)
            tendance = example.scores.index(score)
            # The 3 text tokens carrying the largest pattern weights
            words = get_relevant_qualifiers(example, 3)
            result_per_aspect.append((aspect_name, tendance, score, words))

        # Store the real review identifier (the positional index was stored
        # before, leaving `ids` unused).
        result.append((review_id, result_per_aspect))

In [None]:
#  Get the final result in data frame
# Class index -> label.
# NOTE(review): this mapping is kept exactly as originally written. The ABSA
# library's own sentiment enum may be ordered neutral=0, negative=1,
# positive=2 - TODO confirm against the library; if so, NEGATIVE and NEUTRAL
# are swapped here.
sentiment_labels = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}

rev = []
asp = []
tendance = []
scor = []
context1 = []
context2 = []
context3 = []

# One output row per (review, aspect) pair. Every column is filled inside the
# inner loop so that all columns always have the same length (the sentiment,
# score and context values used to be appended once per review only).
for review_id, per_aspect in result:
    for aspect_name, trend, score, words in per_aspect:
        rev.append(review_id)
        asp.append(aspect_name)
        # Unknown class indices become None, keeping the columns aligned
        tendance.append(sentiment_labels.get(trend))
        scor.append(score)
        # Up to three context words; pad with None when fewer are available
        context = list(words[:3])
        context += [None] * (3 - len(context))
        context1.append(context[0])
        context2.append(context[1])
        context3.append(context[2])

# Set data frame
d = {
    'review_id': rev,
    'aspect': asp,
    'tendance': tendance,
    'score': scor,
    'context_1': context1,
    'context_2': context2,
    'context_3': context3}
final_df = pd.DataFrame(data=d)


In [None]:
#  Print the final result
# Bare expression as the last statement of the cell: a notebook renders it as
# the cell output; it has no visible effect when run as a plain script.
final_df