In [None]:
!pip install deep_translator
!pip install nltk
!pip install matplotlib
!pip install wordcloud

In [None]:
!pip install numpy

In [None]:
!pip install -U scikit-learn pandas

In [None]:
!pip install -U openai

In [1]:
import boto3
from abc import abstractmethod
import collections
from datetime import datetime
from deep_translator import GoogleTranslator
from dotenv import load_dotenv
import joblib
from langchain_core.prompts import PromptTemplate
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize 
from nltk.stem import PorterStemmer
import numpy as np
nltk.download('punkt')
nltk.download('punkt_tab')
import openai
import os
import pandas as pd
import pickle
import re
import requests
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

[nltk_data] Downloading package punkt to /Users/xuanli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/xuanli/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
load_dotenv()

True

In [3]:
# posts_df = pd.read_json('../data/reddit_posts.json')
# comments_df = pd.read_json('../data/reddit_comments.json')

In [4]:
# Get data from S3 bucket - temporary 
from io import BytesIO
import json

s3 = boto3.client('s3')

def load_json(prefix, bucket="is459-project-data", client=None):
    """
    Load every ``.json`` object under an S3 prefix into a single DataFrame.

    Args:
    prefix (str): key prefix to list, e.g. "reddit/posts/"
    bucket (str): bucket to read from; defaults to the project data bucket
    client (object): S3 client to use; defaults to the module-level ``s3`` client

    Returns:
    pd.DataFrame: one row per record. A file holding a JSON list contributes one
    row per element; any other JSON value contributes a single row. The frame is
    empty when no ``.json`` object exists under the prefix.
    """
    client = s3 if client is None else client
    combined_data = []

    # A single list_objects_v2 call returns at most 1000 keys, so walk every page;
    # pages for an empty prefix carry no 'Contents' entry at all.
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for entry in page.get('Contents', []):
            file_key = entry['Key']
            if not file_key.endswith('.json'):
                continue
            obj = client.get_object(Bucket=bucket, Key=file_key)
            data = json.loads(obj['Body'].read())
            if isinstance(data, list):
                combined_data.extend(data)
            else:
                combined_data.append(data)

    return pd.DataFrame(combined_data)

posts_df = load_json("reddit/posts/")
comments_df = load_json("reddit/comments/")

In [5]:
posts_df.shape

(5209, 8)

In [6]:
comments_df.shape

(122499, 7)

In [7]:
# Treat empty strings as missing values, then discard every row that has any missing field.
posts_df = posts_df.replace("", np.nan).dropna()

comments_df = comments_df.replace("", np.nan).dropna()

In [8]:
airlines = {
    'SouthwestAirlines': 'WN', 
    'Southwest_Airlines': 'WN', 
    'AmericanAir': 'AA',
    'DeltaAirlines': 'DL',
    'HawaiianAirlines': 'HA',
    'frontierairlines': 'F9',
    'delta': 'DL'
}

In [9]:
posts_df['Code'] = posts_df['subreddit'].map(airlines)

In [10]:
posts_df = posts_df.drop_duplicates(subset="id", keep="first")

In [11]:
comments_df = comments_df.drop_duplicates(subset="id", keep="first")

In [12]:
posts_df.shape

(3496, 9)

In [13]:
comments_df.shape

(118698, 7)

In [14]:
# Look-up table from post id to airline code, used to tag each comment with its parent post's code.
post_codes_by_id = posts_df.set_index('id')['Code']
code_post_dict = post_codes_by_id.to_dict()
comments_df['Code'] = comments_df['post_id'].map(code_post_dict)

In [15]:
posts_df = posts_df.dropna()
comments_df = comments_df.dropna()

In [16]:
DetectorFactory.seed = 42
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuanli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
translator = GoogleTranslator(source='auto', target='english')

In [18]:
def is_english(text):
    """
    Report whether langdetect classifies the text as English.

    Args:
    text (str): text to check

    Returns:
    bool: True when the detected language code is 'en'; False for any other
    language or when detection raises LangDetectException.
    """
    try:
        detected_language = detect(text)
    except LangDetectException:
        return False
    return detected_language == 'en'

def chunk_text(text, max_length=5000):
    """
    Split text into pieces of at most ``max_length`` characters, preferring to break on a space.

    Args:
    text (str): text to split
    max_length (int): maximum length of each chunk; must be at least 1

    Returns:
    list: chunks in order. Text that already fits is returned as a single-element
    list (``[""]`` for an empty string). No empty chunk is produced when splitting.

    Raises:
    ValueError: if max_length is smaller than 1 (the loop could never make progress)
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []
    while len(text) > max_length:
        split_index = text[:max_length].rfind(' ')
        # -1: no space in the window; 0: the only space is the first character.
        # Either way a split there would yield an empty chunk, so hard-cut instead.
        if split_index <= 0:
            split_index = max_length
        chunks.append(text[:split_index])
        text = text[split_index:].strip()

    # Skip an empty remainder left after splitting (e.g. the text ended in whitespace),
    # but keep the single-chunk result for input that never needed splitting.
    if text or not chunks:
        chunks.append(text)
    return chunks

def translate_text(text):
    """
    Translate non-English text with the module-level translator, best effort.

    Text detected as English is returned unchanged. Text longer than 5000 characters
    is split with chunk_text, translated piece by piece and re-joined with spaces.
    Any exception is printed and the original text is returned instead.

    Args:
    text (str): text to translate

    Returns:
    str: translated text, or the input unchanged when it is English or translation failed
    """
    # NOTE(review): the 5000 threshold presumably mirrors the translator's input limit — confirm
    # whether a text of exactly 5000 characters is accepted by the service.
    try:
        if is_english(text):
            return text
        if len(text) <= 5000:
            return translator.translate(text)

        translated_chunks = []
        for chunk in chunk_text(text):
            translated_chunks.append(translator.translate(chunk))
        return ' '.join(translated_chunks)
    except Exception as e:
        # Deliberate best effort: one failing row must not abort a whole DataFrame.apply.
        print(f"Error translating text: {e}")
        return text

In [19]:
def preprocess_text(text):
    """
    Normalise text for vectorisation: lowercase it, strip digits and punctuation,
    drop stop words and stem what remains.

    Args:
    text (str): text to preprocess

    Returns:
    text (str): space-joined stems of the surviving tokens
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # Keeps word characters (letters and underscore) and whitespace only.
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)

    # NOTE(review): punctuation is removed before the stop-word check, so a contraction such as
    # "don't" arrives here as "dont" — presumably that form is not in stop_words; verify.
    stems = []
    for token in without_punctuation.split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [20]:
def get_aspect(df, vectorizer=None, lda_model=None, topic_dict=None):
    """
    Assign each row of a DataFrame its dominant LDA topic.

    The 'content' column is vectorised, passed through the LDA model, and the
    highest-scoring topic per row is looked up by position in ``topic_dict``
    (the n-th key names the n-th LDA component). Punctuation and digits are then
    stripped from the title. The result is written to a new 'topic' column of
    ``df`` in place.

    Args:
    df (pd.DataFrame): frame with a 'content' column of preprocessed text
    vectorizer (object): fitted vectorizer exposing ``transform``
    lda_model (object): fitted topic model exposing ``transform``
    topic_dict (dict): topic titles as keys, in LDA component order

    Returns:
    array: the per-row topic distribution returned by ``lda_model.transform``
    """
    tfidf_vector = vectorizer.transform(df['content'])
    aspects = lda_model.transform(tfidf_vector)
    dominant_aspect = np.asarray(aspects).argmax(axis=1)

    # Build the title look-up once instead of once per row.
    topic_names = np.array(list(topic_dict.keys()), dtype=object)

    # Escape the punctuation so characters such as ']', '\\', '^' and '-' are matched
    # literally inside the character class; raw string keeps '\d' a regex escape.
    strip_pattern = rf"[{re.escape(string.punctuation)}\d]"

    # Index the Series like df so the assignment cannot misalign (and produce NaN)
    # when df does not carry a default 0..n-1 index.
    df['topic'] = (
        pd.Series(topic_names[dominant_aspect], index=df.index)
        .str.replace(strip_pattern, "", regex=True)
    )
    return aspects

In [21]:
class Dataset:
    """
    Template-method base class that runs Aspect-Based Sentiment Analysis (ABSA) on a text dataset.

    Construction parses the raw input into ``self.data`` (via the subclass's ``parse``),
    then either reuses the supplied vectorizer / LDA model / VADER model / topic dictionary
    or fits and saves fresh ones, and finally adds "topic" and "sentiment" columns to
    ``self.data``.
    """

    MAX_FEATURES = 50       # vocabulary size kept by the TF-IDF vectorizer
    N_TOPICS = 5            # number of LDA components
    N_TITLE_FEATURES = 20   # highest-weighted features per topic shown to the LLM

    def __init__(self, object, dataset_name=None, vectorizer=None, lda_model=None, vader_model=None, topic_dict=None) -> None:
        """
        Parse the input and run the full ABSA pipeline.

        Args:
        object: raw input handed to ``parse``
        dataset_name (str): prefix used in the file names of saved models
        vectorizer, lda_model, vader_model, topic_dict: pre-built components; if any
            one of them is missing (falsy), all four are rebuilt from ``self.data``
        """
        self.name = dataset_name
        # main template method
        self.data = self.parse(object)
        # insert hook: if vectorizer, lda_model and topic_dict are not provided, prepare them
        if not vectorizer or not lda_model or not topic_dict or not vader_model:
            self.prepare_ABSA()
        else:
            self.vectorizer = vectorizer
            self.lda_model = lda_model
            self.topic_dict = topic_dict
            self.vader_model = vader_model
        self.perform_ABSA()

    # INTERNAL HELPERS
    @staticmethod
    def _top_features(feature_names, topic_weights, n_top):
        """Return the ``n_top`` feature names with the largest weights, strongest first."""
        strongest_first = np.asarray(topic_weights).argsort()[::-1][:n_top]
        return [feature_names[i] for i in strongest_first]

    @staticmethod
    def _save_pickle(obj, path):
        """Pickle ``obj`` to ``path``, creating the parent directory if it is missing."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    # MAIN TEMPLATE METHODS
    def prepare_ABSA(self):
        """
        Prepare ABSA by setting up vectorizer and LDA model

        Returns:
        Modifies self.vectorizer, self.lda_model, self.vader_model and self.topic_dict
        """
        print("Preparing dataset for ABSA...")
        # Order matters: the LDA step reads self.X and self.feature_names set by the vectorizer step.
        self.prepare_vectorizer()
        self.prepare_lda_model()
        self.prepare_vader_model()
        print("Dataset prepared for ABSA")

    def perform_ABSA(self):
        """
        Perform ABSA on text data

        Returns:
        Modifies self.data (adds "topic" and "sentiment" columns) and stores the
        per-row topic distribution in self.aspects
        """
        print("Performing ABSA...")
        print("Extracting aspects...")
        self.aspects = self.extract_aspect()
        print("Getting sentiment...")
        self.get_sentiment()
        print("ABSA completed")

    # FUNCTIONAL METHODS
    def prepare_vectorizer(self):
        """
        Fit a TF-IDF vectorizer (unigrams and bigrams) on self.data['content'] and save it

        Returns:
        Modifies self.X, self.vectorizer and self.feature_names
        """
        print("Preparing vectorizer...")
        # initialize and train vectorizer
        vectorizer = TfidfVectorizer(max_features=self.MAX_FEATURES, ngram_range=(1, 2))
        self.X = vectorizer.fit_transform(self.data['content'])
        self.vectorizer = vectorizer
        # retrieve feature names
        self.feature_names = vectorizer.get_feature_names_out()

        # save vectorizer
        self._save_pickle(vectorizer, f'../models/{self.name}_vectorizer.pkl')
        print(f"Vectorizer saved as {self.name}_vectorizer.pkl")
        return

    def prepare_lda_model(self):
        """
        Fit the LDA model on self.X, then ask the OpenAI chat API for a title per topic

        Returns:
        Modifies self.lda_model and self.topic_dict. Keys of topic_dict are the
        generated titles in LDA component order; each value lists every feature
        name sorted by ascending weight for that topic.
        """
        print("Preparing LDA model...")
        # initialize all dependencies for lda model
        topic_dict = {}
        openai_model = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        lda_model = LatentDirichletAllocation(n_components=self.N_TOPICS, random_state=42)
        lda_model.fit(self.X)
        self.lda_model = lda_model

        topics = self.lda_model.components_

        for idx, topic in enumerate(topics):
            # the highest-weighted features describe the topic; feed only those to chatgpt
            top_features = self._top_features(self.feature_names, topic, self.N_TITLE_FEATURES)
            feature_list = ", ".join(top_features)
            # each request is independent, so earlier titles must be listed explicitly
            # for the "different from previous topics" instruction to be actionable
            previous_titles = ", ".join(topic_dict) or "none"
            prompt = f"""Generate a unique noun phrase or one-word topic for posts that contain the following features. 
            This topic will be used for Aspect-Based Sentiment Analysis on social media data. 
            Ensure the topic is different from previously generated topics. 
            Previously generated topics: {previous_titles}
            Feature names:\n{feature_list}\nTopic:"""
            response = openai_model.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=10,
                temperature=1,
            )

            # content may be None; fall back to a placeholder rather than crash
            title = (response.choices[0].message.content or "").strip() or f"Topic_{idx}"
            # deal with duplicate titles
            if title in topic_dict:
                title = title + "_" + str(idx)
            # add title to topic dictionary
            topic_dict[title] = [self.feature_names[i] for i in topic.argsort()]

        self.topic_dict = topic_dict

        # save lda model and topic dictionary
        self._save_pickle(lda_model, f'../models/{self.name}_lda_model.pkl')
        self._save_pickle(topic_dict, f'../data/{self.name}_topic_dict.pkl')

        print(f"LDA model saved as {self.name}_lda_model.pkl")
        print(f"Topic dictionary saved as {self.name}_topic_dict.pkl")
        return

    def prepare_vader_model(self):
        """
        Prepare VADER model for sentiment analysis

        Returns:
        Modifies self.vader_model
        """
        print("Preparing VADER model...")
        self.vader_model = SentimentIntensityAnalyzer()
        self._save_pickle(self.vader_model, f'../models/{self.name}_vader_model.pkl')

        print(f"VADER model saved as {self.name}_vader_model.pkl")
        return

    @abstractmethod
    def parse(self, json_object: object) -> object:
        """
        Abstract method to parse JSON object to be implemented by child class.

        Return:
        dataframe containing "content" column

        Raises:
        NotImplementedError: always, in this base class
        """
        # The class does not use ABCMeta, so the decorator alone does not stop
        # instantiation; fail loudly instead of returning None.
        raise NotImplementedError("Subclasses of Dataset must implement parse()")

    def extract_aspect(self):
        """
        Assign a dominant topic to every row of self.data using the LDA model

        Returns:
        array: per-row topic distribution returned by get_aspect; self.data gains a "topic" column
        """
        print("Extracting aspects")
        # vectorize text
        return get_aspect(self.data, self.vectorizer, self.lda_model, self.topic_dict)

    def get_sentiment(self):
        """
        Score every row of self.data['content'] with VADER

        Returns:
        None. Modifies self.data by adding a "sentiment" column holding the
        'compound' score of each text.
        """
        # NOTE(review): if parse() returns stemmed, stop-word-stripped text, VADER presumably
        # loses cues it relies on (negations, punctuation, capitalisation) — confirm whether
        # sentiment should be scored on the raw text instead.
        self.data['sentiment'] = self.data['content'].apply(lambda x: self.vader_model.polarity_scores(x)['compound'])
        return

    # def generate_word_clouds(self):
    #     """
    #     Generate word clouds for each topic in the topic dictionary.
    #     """
    #     if not hasattr(self, 'topic_dict') or not self.topic_dict:
    #         print("Topic dictionary is not defined.")
    #         return
        
    #     for topic, keywords in self.topic_dict.items():
    #         text = ' '.join(keywords)
            
    #         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    #         plt.figure(figsize=(10, 5))
    #         plt.imshow(wordcloud, interpolation='bilinear')
    #         plt.axis("off")
    #         plt.title(f"Word Cloud for Topic: {topic}")
    #         plt.show()

In [22]:
class DF_Dataset(Dataset):
    """Dataset whose raw input is a pandas DataFrame of posts (has a 'title' column) or comments."""

    # Columns removed once the text has been cleaned.
    _POST_DROP_COLUMNS = ['title', 'username', 'commentCount', 'score', 'subreddit']
    _COMMENT_DROP_COLUMNS = ['username', 'score', 'post_id', 'parent_id']

    @staticmethod
    def _translate_and_blank(series: pd.Series) -> pd.Series:
        """Translate each value, then mark empty and "[deleted]" entries as missing."""
        translated = series.apply(translate_text)
        return translated.replace("", np.nan).replace("[deleted]", np.nan)

    def parse(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to preprocess text data from a DataFrame.

        Text columns are translated, rows with any missing value are dropped, and for
        frames with a 'title' column the title is prepended to the content. Bookkeeping
        columns are removed and 'content' is run through preprocess_text.

        Args:
        df (pd.DataFrame): DataFrame to parse and preprocess. It is not modified.

        Returns:
        pd.DataFrame: Processed DataFrame.

        Raises:
        KeyError: if 'content' or one of the columns to drop is absent
        """
        print("Parsing DataFrame")

        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()
        has_title = "title" in df.columns

        df["content"] = self._translate_and_blank(df["content"])
        if has_title:
            df["title"] = self._translate_and_blank(df["title"])

        # Drops rows with a missing value in ANY column, not only the text columns.
        df = df.dropna().reset_index(drop=True)

        if has_title:
            df["content"] = df["title"] + " " + df["content"]
            df = df.drop(columns=self._POST_DROP_COLUMNS)
        else:
            df = df.drop(columns=self._COMMENT_DROP_COLUMNS)

        df["content"] = df["content"].apply(preprocess_text)

        print(f"Parsed DataFrame with shape: {df.shape}")
        return df

In [None]:
# posts_df['content'] = posts_df['content'].astype(str)
# posts_df["content"] = posts_df["content"].apply(translate_text)
# posts_df["content"] = posts_df["content"].replace("", np.nan)
# posts_df["content"] = posts_df["content"].replace("[deleted]", np.nan)

# posts_df["title"] = posts_df["title"].apply(translate_text)
# posts_df["title"] = posts_df["title"].replace("", np.nan)
# posts_df["title"] = posts_df["title"].replace("[deleted]", np.nan)

# posts_df = posts_df.dropna(subset=["content", "title"]).reset_index(drop=True)

# posts_df["content"] = posts_df["title"] + " " + posts_df["content"]
# posts_df = posts_df.drop(columns=['title', 'username', 'commentCount', 'score', 'subreddit'])

# posts_df["content"] = posts_df["content"].apply(preprocess_text)

In [None]:
# comments_df['content'] = comments_df['content'].astype(str)
# comments_df["content"] = comments_df["content"].apply(translate_text)
# comments_df["content"] = comments_df["content"].replace("", np.nan)
# comments_df["content"] = comments_df["content"].replace("[deleted]", np.nan)

# comments_df = comments_df.dropna(subset=["content"]).reset_index(drop=True)

# comments_df = comments_df.drop(columns=['username', 'score', 'post_id', 'parent_id'])

# comments_df["content"] = comments_df["content"].apply(preprocess_text)

In [None]:
# df = pd.concat([posts_df, comments_df], ignore_index=True)

In [None]:
# df.head()

In [23]:
# Run the full ABSA pipeline once per airline code, separately for posts and for comments,
# and write each result to "<code>_posts.csv" / "<code>_comments.csv" in the working directory.
codes = comments_df['Code'].unique()
for code in codes:
    posts_subset = posts_df.loc[posts_df['Code'] == code].copy()
    posts_data = DF_Dataset(posts_subset, dataset_name=f"{code}_posts")
    posts_data.data.to_csv(f"{code}_posts.csv", index=False)

    comments_subset = comments_df.loc[comments_df['Code'] == code].copy()
    comments_data = DF_Dataset(comments_subset, dataset_name=f"{code}_comments")
    comments_data.data.to_csv(f"{code}_comments.csv", index=False)

Parsing DataFrame
Parsed DataFrame with shape: (186, 4)
Preparing dataset for ABSA...
Preparing vectorizer...
Vectorizer saved as AA_posts_vectorizer.pkl
Preparing LDA model...
LDA model saved as AA_posts_lda_model.pkl
Topic dictionary saved as AA_posts_topic_dict.pkl
Preparing VADER model...
VADER model saved as AA_posts_vader_model.pkl
Dataset prepared for ABSA
Performing ABSA...
Extracting aspects...
Extracting aspects
Getting sentiment...
ABSA completed
Parsing DataFrame
Parsed DataFrame with shape: (400, 4)
Preparing dataset for ABSA...
Preparing vectorizer...
Vectorizer saved as AA_comments_vectorizer.pkl
Preparing LDA model...
LDA model saved as AA_comments_lda_model.pkl
Topic dictionary saved as AA_comments_topic_dict.pkl
Preparing VADER model...
VADER model saved as AA_comments_vader_model.pkl
Dataset prepared for ABSA
Performing ABSA...
Extracting aspects...
Extracting aspects
Getting sentiment...
ABSA completed
Parsing DataFrame
Parsed DataFrame with shape: (1387, 4)
Prepari

In [None]:
# import pyarrow.parquet as pq
# table1 = pq.read_table('part-00000-bd8a5369-965d-4d67-a1c7-1f30a77cb6d3-c000.snappy.parquet')
# df1 = table1.to_pandas()
# df1

In [None]:
# `pq` is never imported in this notebook (its import only appears in commented-out code),
# so read the parquet part file through pandas, which is already imported.
df2 = pd.read_parquet('part-00000-a3621429-5a6f-47f9-bc59-35e72306ebed-c000.snappy.parquet')
# Order columns alphabetically so the dtype listing is easy to scan.
df2 = df2[sorted(df2.columns)]
df2.dtypes

In [None]:
skytrax_df = load_json("skytrax/reviews/")

In [None]:
skytrax_df.shape

In [None]:
skytrax_df.head()

In [None]:
skytrax_df = skytrax_df.drop_duplicates(subset=["airline", "username", "title", "publishedDate"], keep="first")

In [None]:
skytrax_df.shape

In [None]:
skytrax_df['airline'].unique()

In [None]:
skytrax_airlines = {
    'southwest-airlines': 'WN', 
    'american-airlines': 'AA',
    'delta-air-lines': 'DL',
    'hawaiian-airlines': 'HA',
    'frontier-airlines': 'F9'
}

In [None]:
skytrax_df['Code'] = skytrax_df['airline'].map(skytrax_airlines)

In [None]:
# Strip English ordinal suffixes ("1st", "22nd", "3rd", "4th") so the dates can be parsed.
# .str.replace leaves missing values as NaN instead of raising the TypeError that
# re.sub would, matching the errors='coerce' tolerance of the parsing step below.
published_without_ordinals = skytrax_df['publishedDate'].str.replace(
    r'(\d+)(st|nd|rd|th)', r'\1', regex=True
)
# Unparseable dates become NaT, which strftime turns into NaN.
skytrax_df['publishedDate'] = (
    pd.to_datetime(published_without_ordinals, errors='coerce')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

In [None]:
# Translate both free-text columns, then mark empty and "[deleted]" entries as missing.
for text_column in ("title", "review"):
    translated = skytrax_df[text_column].apply(translate_text)
    translated = translated.replace("", np.nan)
    skytrax_df[text_column] = translated.replace("[deleted]", np.nan)

In [None]:
# Discard every review with a missing value in any column and renumber the rows from 0.
skytrax_df = skytrax_df.dropna().reset_index(drop=True)

In [None]:
skytrax_df

In [None]:
# data.generate_word_clouds()

In [None]:
# def invoke_claimbuster_api(input_claim):
#     try:
#         api_response = requests.get(
#             url=f"https://idir.uta.edu/claimbuster/api/v2/score/text/{input_claim}", headers={"x-api-key": os.environ.get('CLAIMBUSTER_API_KEY')})
#         data = api_response.json()
#         if data["results"]:
#             return data["results"][0]["score"]
#         return 0
#     except Exception as e:  
#         print(f"An error occurred: {e}")
#         return 0

In [None]:
# posts_df['claimScore'] = posts_df.content.apply(invoke_claimbuster_api)

In [None]:
# comments_df['claimScore'] = comments_df.content.apply(invoke_claimbuster_api)

In [None]:
s3 = boto3.client('s3')
try:
    # Date stamp (UTC) that makes the object key unique per day.
    upload_date = datetime.utcnow().strftime("%Y-%m-%d")
    # NOTE(review): the original referenced an undefined name `posts` and sent a JSON body
    # to a ".csv" key; uploading posts_df as CSV is presumably what was intended — confirm
    # which frame should be published.
    s3.put_object(
        Bucket='is459-project-output-data',
        Key=f'reddit/posts/reddit_final_posts_{upload_date}.csv',
        Body=posts_df.to_csv(index=False).encode('utf-8'),
        ContentType='text/csv'
    )
    print("Files uploaded to S3 successfully")
except Exception as e:
    print("Error uploading to S3: ", e)