In [7]:
# ---------- imports ----------
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ---------- path setup ----------
# The input CSV location can be overridden with the CLIMATE_NASA_CSV
# environment variable so the notebook is runnable on machines other than the
# one it was written on. When the variable is unset (or empty) the original
# local path is used, so existing runs behave exactly as before.
DEFAULT_CSV_PATH = r"C:\Users\SIMBY\Documents\climate_nasa.csv"
csv_path = Path(os.environ.get("CLIMATE_NASA_CSV") or DEFAULT_CSV_PATH)

# ---------- load ----------
# Read the raw comments table and show its dimensions plus the first rows
# as a quick sanity check of what was loaded.
df = pd.read_csv(csv_path)
print("Shape:", df.shape)
display(df.head())


Shape: (522, 5)


Unnamed: 0,date,likesCount,profileName,commentsCount,text
0,2022-09-07T17:12:32.000Z,2,4dca617d86b3fdce80ba7e81fb16e048c9cd9798cdfd6d...,,Neat comparison I have not heard it before.\n ...
1,2022-09-08T14:51:13.000Z,0,518ab97f2d115ba5b6f03b2fba2ef2b120540c9681288b...,,An excellent way to visualise the invisible! T...
2,2022-09-07T17:19:41.000Z,1,d82e8e24eb633fd625b0aef9b3cb625cfb044ceb8483e1...,3.0,Does the CO2/ghg in the troposphere affect the...
3,2022-09-08T00:51:30.000Z,4,37a509fa0b5177a2233c7e2d0e2b2d6916695fa9fba3f2...,,excellent post! I defo feel the difference - o...
4,2022-09-07T19:06:20.000Z,16,e54fbbd42a729af9d04d9a5cc1f9bbfe8081a31c219ecb...,26.0,"Yes, and carbon dioxide does not harm the Eart..."


In [8]:
# ---------- missing values ----------
# Report the per-column null counts, fill each affected column with its
# placeholder value, then report again to confirm nothing is left missing.
print("Missing values before:\n", df.isna().sum())

# Placeholder per column: empty string for absent text, a sentinel name for
# absent profiles, and 0 for the numeric counters.
fill_values = {
    'text': "",
    'profileName': "Unknown",
    'likesCount': 0,
    'commentsCount': 0,
}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

print("Missing values after:\n", df.isna().sum())


Missing values before:
 date               0
likesCount         0
profileName        0
commentsCount    278
text              18
dtype: int64
Missing values after:
 date             0
likesCount       0
profileName      0
commentsCount    0
text             0
dtype: int64


In [9]:
# ---------- date features ----------
# Parse the timestamp strings into datetimes; errors='coerce' makes any
# unparseable value NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Derive one column per calendar component from the parsed timestamps.
date_parts = df['date'].dt
for part in ('year', 'month', 'day', 'weekday'):
    df[part] = getattr(date_parts, part)

display(df.loc[:, ['date', 'year', 'month', 'day', 'weekday']].head())


Unnamed: 0,date,year,month,day,weekday
0,2022-09-07 17:12:32+00:00,2022,9,7,2
1,2022-09-08 14:51:13+00:00,2022,9,8,3
2,2022-09-07 17:19:41+00:00,2022,9,7,2
3,2022-09-08 00:51:30+00:00,2022,9,8,3
4,2022-09-07 19:06:20+00:00,2022,9,7,2


In [10]:
# ---------- encode categorical ----------
# Map every distinct profileName to an integer code. The codes are plain
# identifiers produced by the encoder and carry no ordinal meaning.
encoder = LabelEncoder()
profile_codes = encoder.fit_transform(df['profileName'])
df['profile_encoded'] = profile_codes

display(df.loc[:, ['profileName', 'profile_encoded']].head())


Unnamed: 0,profileName,profile_encoded
0,4dca617d86b3fdce80ba7e81fb16e048c9cd9798cdfd6d...,146
1,518ab97f2d115ba5b6f03b2fba2ef2b120540c9681288b...,156
2,d82e8e24eb633fd625b0aef9b3cb625cfb044ceb8483e1...,410
3,37a509fa0b5177a2233c7e2d0e2b2d6916695fa9fba3f2...,106
4,e54fbbd42a729af9d04d9a5cc1f9bbfe8081a31c219ecb...,435


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ---------- text features ----------
# Vectorise the comment text with TF-IDF, dropping English stop words and
# capping the vocabulary at 500 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
comment_texts = df['text']
X_text = tfidf.fit_transform(comment_texts)

print("TF-IDF shape:", X_text.shape)


TF-IDF shape: (522, 500)


In [12]:
from scipy.sparse import hstack

# Prediction target (likesCount for now).
target = "likesCount"

# Non-text features taken straight from the DataFrame.
meta_columns = ['commentsCount', 'year', 'month', 'day', 'weekday', 'profile_encoded']
X_meta = df[meta_columns]
y = df[target]

# Place the TF-IDF columns and the metadata columns side by side in a single
# sparse feature matrix (text columns first, metadata columns last).
X_final = hstack((X_text, X_meta.to_numpy()))

print("Final feature matrix:", X_final.shape)
print("Target shape:", y.shape)


Final feature matrix: (522, 506)
Target shape: (522,)


In [13]:
# Persist the processed DataFrame as-is: every current column of df is
# written (index omitted). The TF-IDF matrix is not part of df, so it is not
# saved here and would need to be stored separately if required.
processed_path = Path("data/processed/social_processed.csv")
output_dir = processed_path.parent
output_dir.mkdir(exist_ok=True, parents=True)  # create missing folders; fine if present
df.to_csv(processed_path, index=False)

print("Processed dataset saved:", processed_path)


Processed dataset saved: data\processed\social_processed.csv
