In [1]:
import os
import pandas as pd

In [5]:

RAW_DIR = os.path.join("..", "data", "raw")

In [6]:
# 1. See exactly what files we have
files = os.listdir(RAW_DIR)
print("Raw files:", files)

# 2. Load each into a DataFrame
df_list = []
for fname in files:
    if fname.lower().endswith(".csv"):
        path = os.path.join(RAW_DIR, fname)
        print(f"Loading {fname} → shape", pd.read_csv(path, nrows=5).shape, "…")
        df = pd.read_csv(path)
        df_list.append((fname, df))

# 3. Unpack them
reddit_df = dict(df_list)["Reddit_Data.csv"]
twitter_df = dict(df_list)["Twitter_Data.csv"]


Raw files: ['Reddit_Data.csv', '.gitkeep', 'Twitter_Data.csv']
Loading Reddit_Data.csv → shape (5, 2) …
Loading Twitter_Data.csv → shape (5, 2) …


In [7]:
print("➤ Reddit data:", reddit_df.shape)
display(reddit_df.head())

print("➤ Twitter data:", twitter_df.shape)
display(twitter_df.head())


➤ Reddit data: (37249, 2)


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


➤ Twitter data: (162980, 2)


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [8]:
print("Reddit info:")
display(reddit_df.info())
print("Missing in Reddit:")
display(reddit_df.isna().sum())

print("\nTwitter info:")
display(twitter_df.info())
print("Missing in Twitter:")
display(twitter_df.isna().sum())


Reddit info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


None

Missing in Reddit:


clean_comment    100
category           0
dtype: int64


Twitter info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


None

Missing in Twitter:


clean_text    4
category      7
dtype: int64

In [9]:
# --- Harmonize column names ---
# Rename Twitter’s column to match Reddit’s
twitter_df = twitter_df.rename(columns={"clean_text": "clean_comment"})

# Ensure category is integer in both
reddit_df["category"]  = reddit_df["category"].astype(int)
twitter_df["category"] = twitter_df["category"].fillna(-1).astype(int)

# --- Combine and inspect again ---
df_all = pd.concat([reddit_df, twitter_df], ignore_index=True)
print("Combined shape before clean:", df_all.shape)

# --- Drop rows with missing or placeholder labels ---
# Here we treat category==-1 (from NaNs) as missing
df_all = df_all[df_all["category"] >= 0]
print("After removing missing labels:", df_all.shape)

# --- Drop any rows with missing text ---
df_all = df_all.dropna(subset=["clean_comment"])
print("After dropna text:", df_all.shape)

# --- Optional: drop exact duplicates of text  ---
before_dupe = len(df_all)
df_all = df_all.drop_duplicates(subset=["clean_comment"])
print(f"Dropped {before_dupe - len(df_all)} duplicate rows; now {len(df_all)} rows")

# Inspect label balance
display(df_all["category"].value_counts())


Combined shape before clean: (200229, 2)
After removing missing labels: (156435, 2)
After dropna text: (156332, 2)
Dropped 379 duplicate rows; now 155953 rows


category
1    87992
0    67961
Name: count, dtype: int64

In [10]:
out_fp = os.path.join("..", "data", "processed", "cleaned_data.csv")
df_all.to_csv(out_fp, index=False)
print("Saved cleaned data to", out_fp)


Saved cleaned data to ../data/processed/cleaned_data.csv
