In [None]:
!pip install deep_translator
!pip install nltk

In [22]:
import boto3
from datetime import datetime
from dotenv import load_dotenv
import numpy as np
import nltk 
nltk.download('punkt')
nltk.download('punkt_tab')
import os
import pandas as pd
import requests

from deep_translator import GoogleTranslator
from nltk.tokenize import sent_tokenize 

[nltk_data] Downloading package punkt to /Users/xuanli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/xuanli/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
load_dotenv()

True

In [24]:
posts_df = pd.read_json('reddit_posts.json')
comments_df = pd.read_json('reddit_comments.json')

In [25]:
posts_df = posts_df.replace("", np.nan)
posts_df.dropna(inplace=True)

comments_df = comments_df.replace("", np.nan)
comments_df.dropna(inplace=True)

In [26]:
airlines = {
    'SouthwestAirlines': 'WN', 
    'Southwest_Airlines': 'WN', 
    'AmericanAir': 'AA',
    'DeltaAirlines': 'DL',
    'HawaiianAirlines': 'HA',
    'frontierairlines': 'F9'
}

In [27]:
posts_df['Code'] = posts_df['subreddit'].map(airlines)

In [28]:
translator = GoogleTranslator(source='auto', target='english')

In [29]:
def chunk_text(text, max_length=5000):
    chunks = []
    while len(text) > max_length:
        split_index = text[:max_length].rfind(' ')
        if split_index == -1:
            split_index = max_length
        chunks.append(text[:split_index])
        text = text[split_index:].strip()
    chunks.append(text)
    return chunks

def translate_text(text):
    try:
        if len(text) > 5000:
            chunks = chunk_text(text)
            translated_chunks = [translator.translate(chunk) for chunk in chunks]
            return ' '.join(translated_chunks)
        else:
            return translator.translate(text)
    except Exception as e:
        print(f"Error translating text: {e}")
        return text

In [9]:
posts_df['content'] = posts_df.content.apply(translate_text)

posts_df["content"] = posts_df["content"].replace("", np.nan)
posts_df["content"] = posts_df["content"].replace("[deleted]", np.nan)
posts_df.dropna(inplace=True)

In [30]:
comprehend = boto3.client('comprehend')

In [31]:
def detect_sentiment(text):
    try:
        result = comprehend.detect_sentiment(Text=text, LanguageCode='en')
        return result['Sentiment']
    except:
        return "UNKNOWN"

In [32]:
posts_df['sentiment'] = posts_df.content.apply(detect_sentiment)

In [33]:
posts_df.to_csv("posts_sentiment.csv")

In [34]:
comments_df['sentiment'] = comments_df.content.apply(detect_sentiment)

In [35]:
comments_df.to_csv("comments_sentiment.csv")

In [16]:
def invoke_claimbuster_api(input_claim):
    try:
        api_response = requests.get(
            url=f"https://idir.uta.edu/claimbuster/api/v2/score/text/{input_claim}", headers={"x-api-key": os.environ.get('CLAIMBUSTER_API_KEY')})
        data = api_response.json()
        if data["results"]:
            return data["results"][0]["score"]
        return 0
    except Exception as e:  
        print(f"An error occurred: {e}")
        return 0

In [32]:
posts_df['claimScore'] = posts_df.content.apply(invoke_claimbuster_api)

In [17]:
comments_df['claimScore'] = comments_df.content.apply(invoke_claimbuster_api)

An error occurred: Expecting value: line 1 column 1 (char 0)


In [36]:
posts_df

Unnamed: 0,id,date,title,content,username,commentCount,score,subreddit,Code,sentiment
0,absdf,2019-08-03 11:27:29,中国产C919飞机有一大缺点 难获国际市场,中国C919飞机原定于去年年底交付。虽然在今年5月完成了即将交付用户的首架飞机的试飞，但还没...,xl,0,0,SouthwestAirlines,WN,NEUTRAL
1,1g7qgpv,2024-10-20 12:04:51,Question/Help,"Hey everyone, I know it’s a long shot, but I t...",Kmelloww,4,0,SouthwestAirlines,WN,NEGATIVE
2,1g7plv5,2024-10-20 11:12:20,Airpods left on plane,I left my airpods on a recent flight and I can...,Ph4Nt0M218,6,3,SouthwestAirlines,WN,NEGATIVE
4,1g7o4ce,2024-10-20 09:44:42,"If I book 4 tickets together and I’m A-List, w...",I know all the tickets in my booking get my bo...,DoubleJob6790,3,3,SouthwestAirlines,WN,NEUTRAL
5,1g7nqmh,2024-10-20 09:23:06,"Elliott, Southwest Airlines to begin settlemen...",Interesting,Even-Compote2602,9,20,SouthwestAirlines,WN,POSITIVE
...,...,...,...,...,...,...,...,...,...,...
513,1fwpz6b,2024-10-05 21:06:15,10000 Miles Promo,\nWill the current 10000 Miles promo be eligib...,MayorShinn,2,1,frontierairlines,F9,NEUTRAL
514,1fwjhgn,2024-10-05 13:40:18,1 hr 46 min layover in Denver -- Yes or No?,EDIT: thanks for the reassurance everyone! i b...,carrotsoup3,29,3,frontierairlines,F9,MIXED
515,1fwesa2,2024-10-05 09:03:36,Frontier airlines app issues,"I booked a flight & am trying to log in, but t...",AnxietyDue61,1,1,frontierairlines,F9,NEGATIVE
516,1fwelej,2024-10-05 08:53:51,No GWP flights,I live in Las Vegas and have the Go Wild Pass....,FantasticScratch5719,5,1,frontierairlines,F9,NEGATIVE


In [37]:
comments_df

Unnamed: 0,id,date,content,username,score,post_id,parent_id,sentiment
0,lssnxhc,2024-10-20 12:26:19,"Where is your origin city, if you are comfort...",NotMyCircuits,1,1g7qgpv,t3_1g7qgpv,NEUTRAL
1,lssphm7,2024-10-20 12:39:02,I would be flying out of RDU. I’m used to adva...,Kmelloww,1,1g7qgpv,t1_lssnxhc,NEUTRAL
2,lssqux4,2024-10-20 12:50:30,Do you need to fly?\nThis site compares ways t...,NotMyCircuits,1,1g7qgpv,t1_lssphm7,NEUTRAL
3,lssr02n,2024-10-20 12:51:42,I’m about to check that out now. I know I’m mo...,Kmelloww,1,1g7qgpv,t1_lssqux4,NEGATIVE
4,lsshz8p,2024-10-20 11:41:16,A. Good luck.\n\nB. Start the insurance claim ...,AshDenver,2,1g7plv5,t3_1g7plv5,NEGATIVE
...,...,...,...,...,...,...,...,...
4438,lqek4vy,2024-10-05 11:10:21,I actually did provide them the CVC code 😅 whi...,WolverineHelpful9775,3,1fwe0df,t1_lqehl6f,NEGATIVE
4439,lqgcrit,2024-10-05 21:38:55,I have delt with them through we chat it’s nor...,SubjectGoal3565,1,1fwe0df,t1_lqdzeu9,NEUTRAL
4440,lqekg3b,2024-10-05 11:12:44,OMG! I would never share CVC code to a human. ...,shaadmaan_icekid,2,1fwe0df,t1_lqek4vy,NEGATIVE
4441,lqekqhb,2024-10-05 11:14:58,Yeah idk what I was thinking. The red flag did...,WolverineHelpful9775,3,1fwe0df,t1_lqekg3b,NEGATIVE


In [44]:
merged_df = comments_df.merge(posts_df, left_on='post_id', right_on='id', how='left')

In [45]:
merged_df = merged_df.drop_duplicates(subset=['post_id', 'username_x'], keep='first')

In [46]:
merged_df = merged_df[merged_df['username_x'] != merged_df['username_y']]

In [48]:
def calculate_matching_percentage(group):
    total_unique_commenters = group['username_x'].nunique()
    group['sentiment_match'] = group['sentiment_x'] == group['sentiment_y']
    matching_sentiments_count = group['sentiment_match'].sum()
    
    if total_unique_users > 0:
        matching_percentage = (matching_sentiments_count / total_unique_users) * 100
    else:
        matching_percentage = 0
    
    return pd.Series({
        'total_unique_commenters': total_unique_commenters,
        'matching_sentiments_count': matching_sentiments_count,
        'matching_percentage': matching_percentage
    })
    
merged_df = merged_df.groupby('post_id').apply(calculate_matching_percentage).reset_index()
merged_df = posts_df.merge(merged_df, left_on='id', right_on='post_id', how='left')

  merged_df = merged_df.groupby('post_id').apply(calculate_matching_percentage).reset_index()


In [49]:
merged_df

Unnamed: 0,id,date,title,content,username,commentCount,score,subreddit,Code,sentiment,post_id,total_unique_users,matching_sentiments_count,matching_percentage
0,absdf,2019-08-03 11:27:29,中国产C919飞机有一大缺点 难获国际市场,中国C919飞机原定于去年年底交付。虽然在今年5月完成了即将交付用户的首架飞机的试飞，但还没...,xl,0,0,SouthwestAirlines,WN,NEUTRAL,,,,
1,1g7qgpv,2024-10-20 12:04:51,Question/Help,"Hey everyone, I know it’s a long shot, but I t...",Kmelloww,4,0,SouthwestAirlines,WN,NEGATIVE,1g7qgpv,1.0,0.0,0.000000
2,1g7plv5,2024-10-20 11:12:20,Airpods left on plane,I left my airpods on a recent flight and I can...,Ph4Nt0M218,6,3,SouthwestAirlines,WN,NEGATIVE,1g7plv5,5.0,1.0,20.000000
3,1g7o4ce,2024-10-20 09:44:42,"If I book 4 tickets together and I’m A-List, w...",I know all the tickets in my booking get my bo...,DoubleJob6790,3,3,SouthwestAirlines,WN,NEUTRAL,1g7o4ce,1.0,0.0,0.000000
4,1g7nqmh,2024-10-20 09:23:06,"Elliott, Southwest Airlines to begin settlemen...",Interesting,Even-Compote2602,9,20,SouthwestAirlines,WN,POSITIVE,1g7nqmh,8.0,1.0,12.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,1fwpz6b,2024-10-05 21:06:15,10000 Miles Promo,\nWill the current 10000 Miles promo be eligib...,MayorShinn,2,1,frontierairlines,F9,NEUTRAL,1fwpz6b,2.0,2.0,100.000000
444,1fwjhgn,2024-10-05 13:40:18,1 hr 46 min layover in Denver -- Yes or No?,EDIT: thanks for the reassurance everyone! i b...,carrotsoup3,29,3,frontierairlines,F9,MIXED,1fwjhgn,20.0,3.0,15.000000
445,1fwesa2,2024-10-05 09:03:36,Frontier airlines app issues,"I booked a flight & am trying to log in, but t...",AnxietyDue61,1,1,frontierairlines,F9,NEGATIVE,1fwesa2,1.0,0.0,0.000000
446,1fwelej,2024-10-05 08:53:51,No GWP flights,I live in Las Vegas and have the Go Wild Pass....,FantasticScratch5719,5,1,frontierairlines,F9,NEGATIVE,1fwelej,2.0,0.0,0.000000


In [60]:
s3 = boto3.client('s3')
try:
    s3.put_object(
        Bucket='is459-project-output-data', 
        Key=f'reddit/posts/reddit_final_posts_{datetime.utcnow().strftime("%Y-%m-%d)}.csv',
        Body=json.dumps(posts),
        ContentType='application/json'
    )
    print("Files uploaded to S3 successfully")
except Exception as e:
    print("Error uploading to S3: ", e)

Error uploading to S3:  name 'json' is not defined


In [49]:
posts_df.to_csv("saved.csv")

In [None]:
posts_df.shape

In [None]:
comments_df.shape