In [1]:
import pandas as pd 
import numpy as np 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
nltk.download('wordnet')
!unzip  /usr/share/nltk_data/corpora/wordnet.zip -d  /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/shar

In [3]:
# Load the merged airline review table; `text` is the working DataFrame that
# every later cell reads from and adds columns to.
text = pd.read_csv('/kaggle/input/airline/text_merged.csv')
# Preview the first rows.
text.head()

Unnamed: 0,Review id,Review header,Review,Recommended
0,anchor885571,"""very terrible experience""",✅ Trip Verified | I bought roundtrip and retu...,no
1,anchor881710,"""very concerned about the safety of Aeroflot""",✅ Trip Verified | I am shocked at how far Aer...,no
2,anchor767446,"""felt very rushed and unpolished""",✅ Trip Verified | Aeroflot has set high stand...,no
3,anchor758822,"""Best airline in Russia""",Not Verified | Best airline in Russia! Very n...,yes
4,anchor754420,"""cabin crew were outstanding""",✅ Trip Verified | I was very impressed with t...,yes


In [4]:
# Check column dtypes and non-null counts before cleaning.
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57932 entries, 0 to 57931
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Review id      57932 non-null  object
 1   Review header  57932 non-null  object
 2   Review         57932 non-null  object
 3   Recommended    57932 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


## 0. Convert to ASCII code

In [5]:
import unicodedata

def remove_accented_chars(x):
    """
    Strip accents and any other non-ASCII characters from a sentence.

    The sentence is first decomposed with Unicode 'NFKD' normalization, which
    splits an accented letter into its base letter plus combining marks.
    Encoding to ASCII with errors ignored then drops every character that has
    no ASCII representation (the combining marks, emoji, ...).

    Parameters:
    - x (str): the sentence to convert

    Returns:
    - (str): the sentence reduced to ASCII characters only
    """
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Reduce every review to plain ASCII (drops accents and symbols).
text['Review'] = text['Review'].apply(remove_accented_chars)
# Replace literal underscores with spaces. Underscores are used further down to
# delimit entity placeholders such as `_gpe_`; presumably this keeps user-typed
# underscores from being confused with them.
text['Review'] = text['Review'].str.replace(r'_', ' ', regex=True)

In [6]:
# Sanity check: Vietnamese diacritics are folded to their base ASCII letters.
remove_accented_chars('Khoa Học Dữ liệu')

'Khoa Hoc Du lieu'

## Replace entities

In [7]:
# Download the large English spaCy pipeline that is loaded with spacy.load() below.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [8]:
# GPE: Geopolitical Entity
import spacy
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the long entity-replacement
# pass below reports progress.
tqdm.pandas()

# Shared spaCy pipeline; replace_entities_with_keywords reads it as a global.
nlp = spacy.load("en_core_web_lg")

def replace_entities_with_keywords(text):
    """
    Mask selected named entities in a sentence with generic placeholders.

    The sentence is run through the module-level spaCy pipeline `nlp`. Each
    token whose entity type is one of the labels listed below is replaced by
    the lower-cased label wrapped in underscores (e.g. `_gpe_`); every other
    token keeps its text. The tokens are joined with single spaces, so the
    original spacing is not preserved, and an entity spanning several tokens
    produces one placeholder per token.

    Parameters:
    - text (str): the sentence to process

    Returns:
    - (str): the space-joined tokens with matching entities masked
    """
    entity_types = ["TIME", "DATE", "QUANTITY", "MONEY", "FLIGHT_CODE", "AIRLINE",
                    "AIRCRAFT", "GPE", "LANGUAGE", "AIRPORT", "CITY"]

    pieces = [
        '_' + tok.ent_type_.lower() + '_' if tok.ent_type_ in entity_types else tok.text
        for tok in nlp(text)
    ]
    return ' '.join(pieces)
    

# Mask entities in every review. This runs the full spaCy pipeline per row and
# is slow (about 39 minutes for 57,932 reviews in the recorded run).
text['Review'] = text['Review'].progress_apply(replace_entities_with_keywords)

100%|██████████| 57932/57932 [38:54<00:00, 24.82it/s]


In [9]:
# Entity masking emits one placeholder per token, so a multi-token entity shows
# up as a run of identical placeholders ("_gpe_ _gpe_ _gpe_"). Collapse every
# such run, whatever its length, into a single placeholder.
entity_types = ["TIME", "DATE", "QUANTITY", "MONEY", "FLIGHT_CODE", "AIRLINE",
                "AIRCRAFT", "GPE", "LANGUAGE", "AIRPORT", "CITY"]
for ent_label in entity_types:
    placeholder = '_' + ent_label.lower() + '_'
    # One placeholder followed by one or more whitespace-separated repeats.
    # Matching the whole run in one go (instead of exactly two occurrences)
    # handles runs of three or more, and not requiring trailing whitespace
    # also covers a run at the very end of a review.
    run_re = placeholder + r'(?:\s+' + placeholder + r')+'
    text['Review'] = text['Review'].str.replace(run_re, placeholder, regex=True)

In [10]:
import pickle

# Checkpoint the entity-masked reviews as a plain list so the slow spaCy pass
# does not have to be repeated.
review_snapshot = list(text['Review'])
with open('/kaggle/working/review_replace_ent.pkl', 'wb') as pkl_file:
    pickle.dump(review_snapshot, pkl_file)

## 1. Expand contractions; remove punctuation, digits and the "verified" marker

In [11]:
import re

# Negative contractions whose stem changes when expanded. `\s?` also accepts
# the space-separated form a tokenizer may produce ("ca n't"). The `\b` on both
# sides keeps the pattern from firing inside longer words ("significant",
# "apparent") and from swallowing the character that follows the contraction.
_IRREGULAR_NEGATIONS = (
    (r"\bsha\s?n[' ]?t\b", "shall not"),
    (r"\bwo\s?n[' ]?t\b", "will not"),
    (r"\bca\s?n[' ]?t\b", "can not"),
)

# Stems that expand regularly: "<stem>n't" -> "<stem> not".
_REGULAR_NEGATION_STEMS = (
    'do', 'did', 'is', 'are', 'am', 'must', 'should', 'may', 'does', 'has',
    'have', 'was', 'were', 'had', 'would', 'could',
)


def decontracted(phrase):
    """
    Expand English contractions in a phrase.

    Steps, in order:
    1. Irregular negations: shan't / won't / can't -> "shall not" / "will not"
       / "can not" (emitted in lower case).
    2. Regular negations: "<stem>n't" -> "<stem> not" for the stems listed in
       `_REGULAR_NEGATION_STEMS`; the stem keeps its original capitalisation.
       Steps 1 and 2 ignore case and accept the apostrophe, a space or nothing
       between "n" and "t" ("don't", "don t", "dont"), as well as a space
       before the "n" ("do n't").
    3. Suffix contractions ('re, 's, 'd, 'll, 've, 'm and a stray " nt").
    4. "would" -> "will" and "could" -> "can". This runs last so that "would"
       produced by step 3 ("I'd") and by step 2 ("wouldn't") is normalised the
       same way as a literal "would".

    Parameters:
    - phrase (str): the text to expand; any non-string value is returned
      unchanged

    Returns:
    - (str): the expanded text
    """
    if not isinstance(phrase, str):
        return phrase

    # specific
    for pattern, expansion in _IRREGULAR_NEGATIONS:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)

    for stem in _REGULAR_NEGATION_STEMS:
        phrase = re.sub(r"\b(" + stem + r")\s?n[' ]?t\b", r"\1 not", phrase,
                        flags=re.IGNORECASE)

    # general
    # These consume the single character after the suffix and emit a trailing
    # space instead; they stay case-sensitive so that a standalone capital
    # letter ("Terminal D") is not mistaken for a contraction.
    phrase = re.sub(r" nt(?:\W|$)", " not ", phrase)
    phrase = re.sub(r"[\s']re(?:\W|$)", " are ", phrase)
    phrase = re.sub(r"[\s']s(?:\W|$)", " is ", phrase)
    phrase = re.sub(r"[\s']d(?:\W|$)", " would ", phrase)
    phrase = re.sub(r"[\s']ll(?:\W|$)", " will ", phrase)
    phrase = re.sub(r"[\s']ve(?:\W|$)", " have ", phrase)
    phrase = re.sub(r"[\s']m(?:\W|$)", " am ", phrase)

    # tense normalisation
    phrase = re.sub(r"would", "will", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"could", "can", phrase, flags=re.IGNORECASE)
    return phrase

In [12]:
def remove_punctuation(text):
    """
    Lower-case and strip a Series of reviews down to words.

    Non-word characters and digits are replaced by spaces, whitespace runs are
    collapsed to a single space and the ends are trimmed. Underscores count as
    word characters, so entity placeholders such as `_gpe_` survive. Finally a
    leading "<word> verified " and a leading "verified review " marker are
    dropped.

    Parameters:
    - text (pandas.Series): the reviews to clean

    Returns:
    - (pandas.Series): the cleaned reviews
    """
    cleaned = (
        text.str.lower()
        .str.replace(r'\W', ' ', regex=True)
        .str.replace(r'\d', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    # findall's first match always starts at position 0, so the optional
    # non-capturing prefix is only ever removed from the start of the review.
    without_status = cleaned.str.findall(r'(?:\w+? ?verified )?(.*)').str[0]
    return without_status.str.findall(r'(?:verified review )?(.*)').str[0]

In [13]:
# Expand contractions first: decontracted() relies on apostrophes, which
# remove_punctuation() replaces with spaces.
text['Review'] = text['Review'].apply(decontracted)
text['Review'] = remove_punctuation(text['Review'])

In [14]:
# Spot-check a window of 50 cleaned reviews; change `i` to move the window.
i = 0
text['Review'][i:i+50]

0     i bought roundtrip and return tickets from _gp...
1     i am shocked at how far aeroflot standards hav...
2     aeroflot has set high standards of achieving a...
3     best airline in _gpe_ very nice staff comforta...
4     i was very impressed with the staff many thank...
5     i missed my flight due to a confusion with the...
6     flight quite uneventful although the hard prod...
7     positives convenient booking via mobile app ve...
8     aeroflot has a good planes and good pilots but...
9     the premium economy service was entirely satis...
10    horrible experience at sheremetyevo airport hu...
11    london to _gpe_ on brand new a i must say i ha...
12    _gpe_ to _gpe_ on aeroflot b i always wanted t...
13    _gpe_ to _gpe_ via _gpe_ in _date_ was a night...
14    _gpe_ to _gpe_ i really liked the service clea...
15    _gpe_ to _gpe_ a new a plane with business cla...
16    _gpe_ to _gpe_ via _gpe_ i paid _money_ for up...
17    _gpe_ to _gpe_ business class was very out

## 2. Tokenize

In [15]:
# Split every cleaned review into a list of word tokens.
nltk_tokens_word = text['Review'].apply(nltk.word_tokenize)

In [16]:
# Show a truncated preview of the token lists.
print(nltk_tokens_word)

0        [i, bought, roundtrip, and, return, tickets, f...
1        [i, am, shocked, at, how, far, aeroflot, stand...
2        [aeroflot, has, set, high, standards, of, achi...
3        [best, airline, in, _gpe_, very, nice, staff, ...
4        [i, was, very, impressed, with, the, staff, ma...
                               ...                        
57927    [had, a, terrible, experience, at, check, in, ...
57928    [one, of, the, better, low, cost, airlines, bu...
57929    [rynair, is, a, luxury, comparing, to, wizz, a...
57930    [we, went, to, ciampino, airport, _time_, earl...
57931    [wizz, air, is, hands, down, the, worst, airli...
Name: Review, Length: 57932, dtype: object


## 3. Remove stopwords

In [17]:
# English stop words, held in a set so the membership test per token is cheap.
stop_words = set(stopwords.words('english'))


def remove_stop_words(token_words):
    """
    Drop stop words from a list of tokens.

    Parameters:
    - token_words (list of str): tokens of one review

    Returns:
    - (list of str): the tokens that are not in `stop_words`, in their
      original order
    """
    return [token for token in token_words if token not in stop_words]

In [18]:
# Remove stop words from every tokenized review.
filter_words = nltk_tokens_word.apply(remove_stop_words)

In [19]:
# Show a truncated preview of the tokens left after stop-word removal.
print(filter_words)

0        [bought, roundtrip, return, tickets, _gpe_, _g...
1        [shocked, far, aeroflot, standards, fallen, si...
2        [aeroflot, set, high, standards, achieving, st...
3        [best, airline, _gpe_, nice, staff, comfortabl...
4        [impressed, staff, many, thanks, aeroflot, rep...
                               ...                        
57927    [terrible, experience, check, wife, _date_, fr...
57928    [one, better, low, cost, airlines, need, make,...
57929    [rynair, luxury, comparing, wizz, air, charge,...
57930    [went, ciampino, airport, _time_, early, tried...
57931    [wizz, air, hands, worst, airline, areas, cust...
Name: Review, Length: 57932, dtype: object


## 4. Lemmatize

In [20]:
# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()


def lemma(filtered_tokens):
    """
    Lemmatize a list of tokens without knowing their part of speech.

    Each token is lemmatized with the POS tags 'a', 's', 'r', 'n', 'v' in that
    order, and the first result that differs from the token is kept. A token
    that no POS tag changes is kept as it is.

    Parameters:
    - filtered_tokens (list of str): tokens of one review

    Returns:
    - (list of str): one lemma per input token, in the same order
    """
    pos_order = ('a', 's', 'r', 'n', 'v')

    def first_changed_lemma(word):
        # Return the first lemma that differs from `word`, else the last attempt
        # (which is then equal to `word`).
        candidate = word
        for pos_tag in pos_order:
            candidate = lemmatizer.lemmatize(word, pos=pos_tag)
            if candidate != word:
                return candidate
        return candidate

    return [first_changed_lemma(token) for token in filtered_tokens]

In [21]:
# Lemmatize every review's remaining tokens.
lemmatized_words = filter_words.apply(lemma)

In [22]:
# Store the lemmatized tokens as one space-separated string per review.
text['Review_cleaned'] = lemmatized_words.apply(" ".join)

## 5. Create bigrams and trigrams

In [23]:
import gensim

In [24]:
# Learn two-token phrases from the lemmatized reviews. A higher `threshold`
# yields fewer phrases.
bigram = gensim.models.Phrases(list(lemmatized_words), min_count=5, threshold=100) # higher threshold fewer phrases.
# Train a second model on the bigram-merged corpus so that a merged pair can be
# joined with a neighbouring token into a longer phrase.
trigram = gensim.models.Phrases(bigram[list(lemmatized_words)], threshold=100)
# Wrap both trained models in Phraser objects; these are what make_bigrams /
# make_trigrams apply to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [25]:
def make_bigrams(texts):
    """
    Apply the bigram model to every document.

    Parameters:
    - texts (iterable of list of str): tokenized documents

    Returns:
    - (list): one `bigram_mod[doc]` result per document, in input order
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """
    Apply the bigram model and then the trigram model to every document.

    Parameters:
    - texts (iterable of list of str): tokenized documents (plain tokens; the
      bigram model is applied here, not by the caller)

    Returns:
    - (list): one `trigram_mod[bigram_mod[doc]]` result per document, in input
      order
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [26]:
# Merge detected bigrams in every review and store the result as one
# space-separated string per review.
text['Review_bigrams'] = [" ".join(tokens) for tokens in make_bigrams(lemmatized_words)]

In [27]:
# make_trigrams() already runs the bigram model on each document before the
# trigram model, so it is given the plain lemmatized tokens. Wrapping the input
# in make_bigrams() as well would apply the bigram model twice, a redundant
# extra pass over the whole corpus.
text['Review_trigrams'] = make_trigrams(lemmatized_words)
# Store the result as one space-separated string per review.
text['Review_trigrams'] = text['Review_trigrams'].apply(lambda x: " ".join(x))

In [30]:
# Preview the final phrase-merged reviews.
text['Review_trigrams']

0        buy roundtrip return ticket _gpe_ _gpe_ via _g...
1        shock far aeroflot standard fall since invade ...
2        aeroflot set high standard achieve star airlin...
3        best airline _gpe_ nice staff comfortable seat...
4        impress staff many thank aeroflot representati...
                               ...                        
57927    terrible experience check wife _date_ frequent...
57928    one good low cost airline need make sure organ...
57929    rynair luxury compare wizz air charge every th...
57930    go ciampino airport _time_ early try open wizz...
57931    wizz air hand bad airline area customer servic...
Name: Review_trigrams, Length: 57932, dtype: object