In [None]:
import re
import string
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [None]:
# Fetch the NLTK resources used by the preprocessing step:
#   - 'punkt' / 'punkt_tab': models behind word_tokenize (newer NLTK releases
#     load the tokenizer from 'punkt_tab' instead of the pickled 'punkt')
#   - 'stopwords': the English stop-word list
#   - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer (some NLTK releases
#     need 'omw-1.4' alongside 'wordnet')
# nltk.download reports a failure and returns False rather than raising, so
# requesting a resource a given release does not need is harmless.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the Reddit dataset from CSV into the working DataFrame `df`.
reddit_csv = 'Data/reddit_data.csv'
df = pd.read_csv(reddit_csv)

In [None]:
# Preview the first five rows to check that the CSV loaded as expected.
df.head(n=5)

Unnamed: 0,Subreddit,Title,Score,URL,Comments
0,health,Medical residents are starting to avoid states...,427,https://www.npr.org/sections/health-shots/2024...,They're not the only ones avoiding those states.
1,health,Medical residents are starting to avoid states...,427,https://www.npr.org/sections/health-shots/2024...,> The AAMC analysis notes that even in states ...
2,health,Medical residents are starting to avoid states...,427,https://www.npr.org/sections/health-shots/2024...,"I wouldn’t want to work in an authoritarian, b..."
3,health,Medical residents are starting to avoid states...,427,https://www.npr.org/sections/health-shots/2024...,it makes sense that doctors would want to prac...
4,health,Medical residents are starting to avoid states...,427,https://www.npr.org/sections/health-shots/2024...,"Oh, look, Telling people they might lose their..."


In [None]:
# Lazily-filled cache for the NLTK resources used by preprocess_text, so the
# stop-word list and lemmatizer are built once instead of once per comment.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize one comment into a space-joined string of cleaned tokens.

    Steps, in order: drop links, drop non-ASCII characters, drop ASCII
    punctuation, tokenize, lowercase, remove English stop words, lemmatize.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV field) is treated as an
            empty comment.

    Returns:
        The preprocessed comment as a single string; ``''`` for non-string
        input.

    NOTE(review): the NLTK English stop-word list contains negations such as
    "not" and "no"; removing them before sentiment scoring can change the
    polarity of a comment -- confirm this is intended.
    """
    # Missing values reach this function as float NaN / None, which have no
    # .encode(); map them to an empty comment instead of raising.
    if not isinstance(text, str):
        return ''

    # Remove links FIRST, while '://' and '.' are still present. Stripping
    # punctuation beforehand turns 'http://a.com' into 'httpacom', which an
    # 'https'-only pattern no longer recognises.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)

    # Remove emojis. This drops every non-ASCII character, so typographic
    # quotes and accented letters are removed as well.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase each token, discard stop words, and lemmatize what remains.
    lowered_tokens = (token.lower() for token in tokens)
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    ]

    # Join tokens back into a single string
    return ' '.join(cleaned_tokens)

In [None]:
# Replace each raw comment with its preprocessed form (overwrites the column).
df['Comments'] = [preprocess_text(raw_comment) for raw_comment in df['Comments']]

In [None]:
# Score every preprocessed comment with TextBlob, collecting one
# (polarity, subjectivity) pair per comment in row order.
comment_sentiments = [
    (sentiment.polarity, sentiment.subjectivity)
    for sentiment in (TextBlob(comment).sentiment for comment in df['Comments'])
]

In [None]:
# Attach the sentiment scores to the DataFrame as two new columns.
# Built with comprehensions rather than zip(*...) so that an empty result
# list yields two empty columns instead of failing on tuple unpacking.
df['Comment_Polarity'] = [polarity for polarity, _ in comment_sentiments]
df['Comment_Subjectivity'] = [subjectivity for _, subjectivity in comment_sentiments]

# to_csv does not create missing directories, so make sure the output
# folder exists before writing.
results_path = Path('Results/reddit_data_with_sentiment.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)

print("Sentiment analysis completed and data saved successfully!")

Sentiment analysis completed and data saved successfully!


In [None]:
# Tally comments by the sign of their polarity score:
# above zero is positive, exactly zero is neutral, below zero is negative.
polarity_scores = df['Comment_Polarity']
positive_comments = int((polarity_scores > 0).sum())
neutral_comments = int((polarity_scores == 0).sum())
negative_comments = int((polarity_scores < 0).sum())

# Report the three tallies
print("Number of comments with positive opinions:", positive_comments)
print("Number of comments with neutral opinions:", neutral_comments)
print("Number of comments with negative opinions:", negative_comments)

Number of comments with positive opinions: 10467
Number of comments with neutral opinions: 6043
Number of comments with negative opinions: 4974


In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.28.1-py3-none-any.whl (320 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/320.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m235.5/320.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-

In [1]:
! pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m901.9 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0
