In [107]:
import mysql.connector
import pandas as pd
import os
from dotenv import load_dotenv
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.utils import resample

import joblib

In [108]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /Users/veronica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/veronica/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/veronica/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veronica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/veronica/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [109]:
load_dotenv()

conn = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME")
)

query = """
SELECT 
    tw.wish_text,
    sr.travel_reason
FROM travel_wishes AS tw
JOIN survey_responses AS sr
ON tw.survey_id = sr.id
WHERE sr.travel_reason IS NOT NULL 
AND tw.wish_text NOT LIKE "%No answer%";
"""

df = pd.read_sql(query, conn)

conn.close()

  df = pd.read_sql(query, conn)


In [110]:
df

Unnamed: 0,wish_text,travel_reason
0,I'd like to go cultural exchange in Switzerlan...,Adventure
1,I want to go attending a music festival in Spa...,Cultural Immersion
2,"Unfortunately, I can't afford to go hiking in ...",Budget-friendly options
3,I'm thinking about go visiting art museums in ...,Adventure
4,I'd like to go scuba diving in South Africa fo...,Budget-friendly options
...,...,...
969,I'm considering go exploring ancient ruins in ...,Cultural Immersion
970,My dream is to go cultural exchange in Canada ...,Cultural Immersion
971,My dream is to go visiting national parks in C...,Relaxation
972,I'm so excited about go visiting art museums i...,Relaxation


In [111]:
## LEMMATIZATION

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):

    text = text.lower()
    
    negations = ["not", "no", "never", "n't", "none"]
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] in negations and i + 1 < len(tokens):
            tokens[i + 1] = "NOT_" + tokens[i + 1]
    text = ' '.join(tokens)

    text = re.sub(r'\W+', ' ', text)

    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])

    return lemmatized_text

df['processed_wish'] = df['wish_text'].apply(preprocess_text)



In [112]:
df[['wish_text', 'processed_wish']].head()

Unnamed: 0,wish_text,processed_wish
0,I'd like to go cultural exchange in Switzerlan...,like go cultural exchange switzerland food exp...
1,I want to go attending a music festival in Spa...,want go attending music festival spain adventu...
2,"Unfortunately, I can't afford to go hiking in ...",unfortunately ca n NOT_afford go hiking mounta...
3,I'm thinking about go visiting art museums in ...,thinking go visiting art museum japan food exp...
4,I'd like to go scuba diving in South Africa fo...,like go scuba diving south africa family frien...


In [113]:
df

Unnamed: 0,wish_text,travel_reason,processed_wish
0,I'd like to go cultural exchange in Switzerlan...,Adventure,like go cultural exchange switzerland food exp...
1,I want to go attending a music festival in Spa...,Cultural Immersion,want go attending music festival spain adventu...
2,"Unfortunately, I can't afford to go hiking in ...",Budget-friendly options,unfortunately ca n NOT_afford go hiking mounta...
3,I'm thinking about go visiting art museums in ...,Adventure,thinking go visiting art museum japan food exp...
4,I'd like to go scuba diving in South Africa fo...,Budget-friendly options,like go scuba diving south africa family frien...
...,...,...,...
969,I'm considering go exploring ancient ruins in ...,Cultural Immersion,considering go exploring ancient ruin switzerl...
970,My dream is to go cultural exchange in Canada ...,Cultural Immersion,dream go cultural exchange canada mountain exp...
971,My dream is to go visiting national parks in C...,Relaxation,dream go visiting national park canada beach e...
972,I'm so excited about go visiting art museums i...,Relaxation,excited go visiting art museum new zealand foo...


In [114]:
# LABELLING SENTIMENTS

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)
    if sentiment_score['compound'] >= 0.02:
        return 'excited'
    elif sentiment_score['compound'] <= -0.02:
        return 'negative'
    else:
        return 'neutral'

df['sentiment'] = df['processed_wish'].apply(get_sentiment)

In [115]:
df

Unnamed: 0,wish_text,travel_reason,processed_wish,sentiment
0,I'd like to go cultural exchange in Switzerlan...,Adventure,like go cultural exchange switzerland food exp...,excited
1,I want to go attending a music festival in Spa...,Cultural Immersion,want go attending music festival spain adventu...,excited
2,"Unfortunately, I can't afford to go hiking in ...",Budget-friendly options,unfortunately ca n NOT_afford go hiking mounta...,excited
3,I'm thinking about go visiting art museums in ...,Adventure,thinking go visiting art museum japan food exp...,neutral
4,I'd like to go scuba diving in South Africa fo...,Budget-friendly options,like go scuba diving south africa family frien...,excited
...,...,...,...,...
969,I'm considering go exploring ancient ruins in ...,Cultural Immersion,considering go exploring ancient ruin switzerl...,negative
970,My dream is to go cultural exchange in Canada ...,Cultural Immersion,dream go cultural exchange canada mountain exp...,excited
971,My dream is to go visiting national parks in C...,Relaxation,dream go visiting national park canada beach e...,excited
972,I'm so excited about go visiting art museums i...,Relaxation,excited go visiting art museum new zealand foo...,excited


In [116]:
df['sentiment'].value_counts()

sentiment
excited     765
negative    129
neutral      80
Name: count, dtype: int64

In [117]:
# VECTORIZATION - CAPTURING IMPORTANT WORDS

vectorizer = TfidfVectorizer(max_features=1000)


In [118]:
# TRAINING THE MODEL

X = vectorizer.fit_transform(df["processed_wish"])
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [119]:
print(y_train.value_counts())

sentiment
excited     612
negative    103
neutral      64
Name: count, dtype: int64


In [120]:
# BALANCE DATA

smote = SMOTE(random_state=42, k_neighbors=2)

X_train_dense = X_train.toarray()

X_train_balanced, y_train_balanced = smote.fit_resample(X_train_dense, y_train)

In [121]:
model = LogisticRegression()
model.fit(X_train_balanced, y_train_balanced)

In [122]:
X_test_dense = X_test.toarray()
y_pred = model.predict(X_test_dense)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

     excited       1.00      0.88      0.94       153
    negative       0.67      1.00      0.80        26
     neutral       0.71      0.94      0.81        16

    accuracy                           0.90       195
   macro avg       0.79      0.94      0.85       195
weighted avg       0.93      0.90      0.91       195



In [123]:
# PREDICTION FOR CURRENT USER

def predict_sentiment(new_wish):
    processed_wish = preprocess_text(new_wish)  
    vectorized_wish = vectorizer.transform([processed_wish])
    prediction = model.predict(vectorized_wish)
    return prediction[0]

In [125]:
# EXPORT MODEL AND VECTORIZER

joblib.dump(model, 'pkl/sentiment_model.pkl')
joblib.dump(vectorizer, 'pkl/sentiment_vectorizer.pkl')

['pkl/sentiment_vectorizer.pkl']