# **Text Preprocessing**

In [None]:
import re
import unicodedata

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Load the raw airline reviews; the text to analyse is in the 'Review' column.
data = pd.read_csv('/content/airlines_type.csv')

# Fetch the NLTK resources used below:
#   - 'punkt' / 'punkt_tab': tokenizer data behind word_tokenize. Newer NLTK
#     releases load 'punkt_tab' rather than 'punkt', so both are requested to
#     keep the notebook runnable on a fresh kernel across NLTK versions.
#   - 'stopwords': the English stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Initialize the Porter Stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function to perform text preprocessing
def preprocess_text(text):
    """Normalize one review: ASCII-fold, strip symbols, tokenize, drop stop words, stem.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas uses for a missing review) is treated as an empty review.

    Returns
    -------
    str
        The stemmed tokens that are not English stop words, joined by single
        spaces; '' when the input is not a string.
    """
    # Missing reviews arrive as float NaN; calling .encode on them would raise.
    if not isinstance(text, str):
        return ''

    # Fold to ASCII. A utf-8 -> utf-8 round trip leaves non-ASCII characters in
    # place, so encode to ASCII instead. NFKD normalization runs first so that
    # accented letters keep their base letter and non-breaking spaces become
    # ordinary spaces instead of being dropped.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Replace special characters and symbols with a space (not ''), so words
    # separated only by punctuation ("late.The") are not glued together.
    text = re.sub(r'[^A-Za-z0-9\s\']', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stemming and stop words removal (stop-word check is case-insensitive)
    tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stop_words]

    return ' '.join(tokens)

# Clean every entry of the 'Review' column and store the result next to the
# original text in a new 'Preprocessed_Reviews' column.
data['Preprocessed_Reviews'] = data['Review'].map(preprocess_text)

# Persist the augmented frame (without the row index) for the next stage.
FilePath = '/content/airlines_processed.csv'
data.to_csv(path_or_buf=FilePath, index=False)

# **Vader Lexicon**

In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

data2 = pd.read_csv('/content/airlines_processed.csv')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Reviews whose preprocessed text is '' are written as empty CSV fields and
# read back as NaN; polarity_scores cannot handle a float, so restore ''.
reviews = data2['Preprocessed_Reviews'].fillna('')

# NOTE(review): VADER is a lexicon/rule-based scorer and is being fed stemmed,
# stop-word-stripped text here; stems that no longer match lexicon entries may
# go unscored. Consider scoring the raw 'Review' column instead - confirm intent.

# Score every review; each result is a dict of VADER scores
# (keys 'neg', 'neu', 'pos', 'compound').
sentiment_scores = [sia.polarity_scores(review) for review in reviews]

# Attach the score dicts to the frame
data2['sentiment_scv'] = sentiment_scores

data2.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review,Type Of Traveller,Seat Type,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,...,Inflight Entertainment,Wifi & Connectivity,departure,destination,in_usa,destination_country,destination_in_USA,flight_type,Preprocessed_Reviews,sentiment_scv
0,Aer Lingus,3,"""Worst lack of care""",We flew Aer Lingus last summer Seattle to Dubl...,Couple Leisure,Economy Class,8/1/2022,Satisfied,Satisfied,Neutral,...,Neutral,Dissatisfied,Seattle,Dublin,True,Ireland,False,International,flew aer lingu last summer seattl dublin edinb...,"{'neg': 0.097, 'neu': 0.838, 'pos': 0.065, 'co..."
1,Aer Lingus,1,"""This airline is a joke""",Got on the plane and it is so old that there i...,Family Leisure,Economy Class,6/1/2023,Very Dissatisfied,Very Dissatisfied,Very Dissatisfied,...,Dissatisfied,Dissatisfied,Chicago,Dublin,True,Ireland,False,International,got plane old usb outlet burn air flow inform ...,"{'neg': 0.065, 'neu': 0.797, 'pos': 0.138, 'co..."
2,Aer Lingus,1,"""involuntarily denied boarding""",We booked our flight 7 months out and checked ...,Couple Leisure,Economy Class,6/1/2023,Very Dissatisfied,Very Satisfied,Neutral,...,Dissatisfied,Dissatisfied,Boston,Athens,True,Greece,False,International,book flight 7 month check 4 hour earli involun...,"{'neg': 0.04, 'neu': 0.924, 'pos': 0.037, 'com..."
3,Aer Lingus,1,"""customer service is non existent""",If you have to book with them and have no othe...,Couple Leisure,Economy Class,6/1/2023,Very Dissatisfied,Very Dissatisfied,Very Dissatisfied,...,Very Dissatisfied,Very Dissatisfied,New York,Naples,True,Italy,False,International,book option pleas consid buy air tag travel so...,"{'neg': 0.082, 'neu': 0.75, 'pos': 0.168, 'com..."
4,Aer Lingus,1,"""Lost luggage nightmare""",WORSE THAN SPIRIT AIRLINES! Lost luggage night...,Couple Leisure,Economy Class,6/1/2023,Very Dissatisfied,Very Dissatisfied,Very Dissatisfied,...,Very Dissatisfied,Very Dissatisfied,Washington,Naples,True,Italy,False,International,wors spirit airlin lost luggag nightmar easili...,"{'neg': 0.24, 'neu': 0.635, 'pos': 0.125, 'com..."


# **Split VADER Scores into Tertiles**

In [None]:
# Keep only the 'compound' value from each VADER score dict, then discard the
# dict column itself.
compound_scores = data2['sentiment_scv'].map(lambda scores: scores['compound'])
data2['compound_vader'] = compound_scores
del data2['sentiment_scv']

# Cut points: minimum, 1/3 quantile, 2/3 quantile and maximum of the compound score.
tertile_probs = [0, 1/3, 2/3, 1]
tertiles = data2['compound_vader'].quantile(tertile_probs)

# Show the calculated tertiles
print("Tertiles:", tertiles, sep="\n")

Tertiles:
0.000000   -0.986200
0.333333   -0.529567
0.666667    0.476700
1.000000    0.996000
Name: compound_vader, dtype: float64


In [None]:
labels = ['Negative', 'Neutral', 'Positive']
# pd.cut builds right-closed intervals by default, so the lowest edge (the
# minimum score) would fall outside the first bin and be labelled NaN.
# include_lowest=True closes the first interval on the left as well.
data2['category_vader'] = pd.cut(
    data2['compound_vader'],
    bins=tertiles,
    labels=labels,
    include_lowest=True,
)

In [None]:
# Write the VADER-scored frame to disk without the row index.
filepath = '/content/vader.csv'
data2.to_csv(path_or_buf=filepath, index=False)

# **TextBlob**

In [None]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt

# Read the CSV at /content/vader.csv into a DataFrame for the TextBlob pass.
datatb = pd.read_csv(filepath_or_buffer='/content/vader.csv')

# Function to calculate sentiment polarity using TextBlob
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        Text to score. Non-string input (for example the NaN that pandas
        produces for an empty CSV field) is scored as neutral.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` for string input; 0.0 otherwise.
    """
    # Empty reviews come back from CSV as float NaN, which TextBlob rejects;
    # score them as neutral instead of raising.
    if not isinstance(text, str):
        return 0.0

    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Score the 'Preprocessed_Reviews' column with TextBlob, one polarity per row.
datatb['Sentiment_tb'] = datatb['Preprocessed_Reviews'].map(calculate_sentiment)

# Cut points: minimum, 1/3 quantile, 2/3 quantile and maximum of the polarity.
tertile_probs = [0, 1/3, 2/3, 1]
tertiles2 = datatb['Sentiment_tb'].quantile(tertile_probs)

# Show the calculated tertiles
print("Tertiles2:", tertiles2, sep="\n")

Tertiles2:
0.000000   -1.000000
0.333333   -0.037500
0.666667    0.087446
1.000000    0.850000
Name: Sentiment_tb, dtype: float64


In [None]:
labels = ['Negative', 'Neutral', 'Positive']
# pd.cut builds right-closed intervals by default, so the lowest edge (the
# minimum polarity) would fall outside the first bin and be labelled NaN.
# include_lowest=True closes the first interval on the left as well.
datatb['category_tb'] = pd.cut(
    datatb['Sentiment_tb'],
    bins=tertiles2,
    labels=labels,
    include_lowest=True,
)

In [None]:
# Write the frame with both VADER and TextBlob columns, without the row index.
filepath = '/content/vadertextblob.csv'
datatb.to_csv(path_or_buf=filepath, index=False)

# **Correlation**

In [None]:
df = pd.read_csv('/content/vadertextblob.csv')

# Correlate the VADER compound score with the TextBlob polarity
# (Series.corr with its default method).
vader_scores = df["compound_vader"]
textblob_scores = df["Sentiment_tb"]
correlation_coefficient = vader_scores.corr(textblob_scores)

# Report the correlation coefficient
print("Pearson Correlation Coefficient:", correlation_coefficient)

Pearson Correlation Coefficient: 0.5543620636161801
