In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import matplotlib.pyplot as plt
# Download the VADER lexicon used for sentiment analysis.
# The call's return value (True in the recorded run) is echoed as the cell output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Mount Google Drive in the Colab runtime so that files under /content/drive
# (such as the comments CSV read later) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the scraped Reddit EV comments on the mounted Google Drive.
comments_csv_path = "/content/drive/My Drive/Reddit-EVs/ev_comments.csv"
df = pd.read_csv(comments_csv_path)

In [None]:
# Analyzer shared by every call and created on first use. Building a new
# SentimentIntensityAnalyzer per comment repeats its setup work for each row
# passed through Series.apply, which is wasteful on thousands of comments.
_VADER_ANALYZER = None

# Cut-offs on VADER's compound score: at or above the positive bound is
# "Positive", at or below the negative bound is "Negative", and anything in
# between is "Neutral".
_POSITIVE_THRESHOLD = 0.05
_NEGATIVE_THRESHOLD = -0.05


def sentiment_classification(comment, analyzer=None):
    """Label a comment as "Positive", "Negative" or "Neutral" using VADER.

    Parameters
    ----------
    comment : object
        The comment text. Missing values (``None``/``NaN``) and strings that
        are empty or whitespace-only are labelled "Neutral" without scoring.
        Any other value is converted with ``str()`` before scoring.
    analyzer : object, optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with
        a ``"compound"`` key. When omitted, a single shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused.

    Returns
    -------
    str
        "Positive" if compound >= 0.05, "Negative" if compound <= -0.05,
        otherwise "Neutral".
    """
    global _VADER_ANALYZER

    # Nothing to score: treat missing or blank text as neutral.
    if pd.isna(comment) or str(comment).strip() == "":
        return "Neutral"

    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    # The compound score is VADER's single normalised summary of the text.
    sent_score = analyzer.polarity_scores(str(comment))["compound"]

    if sent_score >= _POSITIVE_THRESHOLD:
        return "Positive"
    if sent_score <= _NEGATIVE_THRESHOLD:
        return "Negative"
    # Scores strictly between the two thresholds carry no clear polarity.
    return "Neutral"


In [None]:
# Score every comment body and store the resulting label alongside it.
comment_bodies = df["comment_body"]
df["comment_sentiment"] = comment_bodies.map(sentiment_classification)

In [None]:
# Preview up to the first 20 rows, including the new comment_sentiment column.
df.head(20)

Unnamed: 0,post_id,post_title,post_created_time,comment_id,comment_body,comment_author,comment_created_time,comment_sentiment
0,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvd61a,Have you visited today’s **[Daily Discussion](...,AutoModerator,2025-10-23 10:53:01,Positive
1,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvip8o,Kids are going to hang off that and bend it,mietwad,2025-10-23 11:26:26,Negative
2,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvgikr,Saw this on Instagram recently from the counci...,delljj,2025-10-23 11:13:15,Negative
3,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvjy14,"If it's a trial, I imagine those structure may...",Hypo_Mix,2025-10-23 11:33:59,Positive
4,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvg2bp,Won't be long before the junkies steal the cop...,MarkFromTheInternet,2025-10-23 11:10:31,Negative
5,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvkmb7,This is a quarter of a Hills Hoist and a 15m e...,blackabbot,2025-10-23 11:38:06,Negative
6,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvfiay,This is discriminatory against tall people. \n...,Beast_of_Guanyin,2025-10-23 11:07:09,Positive
7,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvgld6,Surely the answer is to have outlets installed...,zsaleeba,2025-10-23 11:13:43,Positive
8,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvhnxv,What could possibly go wrong….,Makunouchiipp0,2025-10-23 11:20:11,Negative
9,1odoh28,Merri-bek is conducting a Home-to-street EV ch...,2025-10-23 10:53:00,nkvenip,Looks like shit and given there is plenty of c...,Flaky-Gear-1370,2025-10-23 11:01:56,Negative


In [None]:
# Summarise column dtypes and non-null counts after adding the sentiment label.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11267 entries, 0 to 11266
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   post_id               11267 non-null  object
 1   post_title            11267 non-null  object
 2   post_created_time     11267 non-null  object
 3   comment_id            11267 non-null  object
 4   comment_body          11267 non-null  object
 5   comment_author        9848 non-null   object
 6   comment_created_time  11267 non-null  object
 7   comment_sentiment     11267 non-null  object
dtypes: object(8)
memory usage: 704.3+ KB


In [None]:
# Parse both timestamp columns; values that cannot be parsed become NaT
# because of errors="coerce".
for time_col in ("post_created_time", "comment_created_time"):
    df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

In [None]:
# Split the comment timestamp into one column per calendar component.
comment_time = df["comment_created_time"].dt
for part in ("year", "month", "day", "hour"):
    df[part] = getattr(comment_time, part)


In [None]:
# Character count of each comment body after converting it to a string.
df["comment_length_chars"] = [len(str(body)) for body in df["comment_body"]]

In [None]:
# Columns written to the exported file, in output order: identifier and label
# first, then the timestamp components, then the length feature.
time_part_cols = ["year", "month", "day", "hour"]
export_cols = ["comment_id", "comment_sentiment", *time_part_cols, "comment_length_chars"]

In [None]:
# Keep only the columns listed in export_cols for the exported table.
final_df = df[export_cols]

In [None]:
# Check dtypes and non-null counts of the export subset before any rows are dropped.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11267 entries, 0 to 11266
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   comment_id            11267 non-null  object 
 1   comment_sentiment     11267 non-null  object 
 2   year                  11266 non-null  float64
 3   month                 11266 non-null  float64
 4   day                   11266 non-null  float64
 5   hour                  11266 non-null  float64
 6   comment_length_chars  11267 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 616.3+ KB


In [None]:
# Discard every row that has a missing value in any of the exported columns.
final_df = final_df.dropna(axis="index", how="any")

In [None]:
# Cast the date-part columns to int in a single step. DataFrame.astype returns
# a new frame, so no column is assigned into a frame derived from a slice,
# which is what raises pandas' SettingWithCopyWarning with per-column assignment.
final_df = final_df.astype({col: int for col in ("year", "month", "day", "hour")})

In [None]:
# Write the cleaned table without the index column; the utf-8-sig encoding
# prepends a byte-order mark to the file.
output_csv_path = "reddit_preprocessed.csv"
final_df.to_csv(output_csv_path, encoding="utf-8-sig", index=False)

print("✅ Preprocessed file exported successfully for Power BI!")

✅ Preprocessed file exported successfully for Power BI!
