# In [None]:
import io

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Load the raw table -------------------------------------------------------
# Expected name of the data file. Used to pick the right entry out of a Colab
# upload and as the local fallback path.
DATA_FILENAME = "Raw HIV Data Philippines.csv"

try:
    from google.colab import files
except ImportError:
    # Not running inside Colab: nothing is uploaded, read the local file below.
    uploaded = {}
else:
    # files.upload() returns a dict mapping each uploaded file name to its bytes.
    uploaded = files.upload()

if DATA_FILENAME in uploaded:
    csv_source = io.BytesIO(uploaded[DATA_FILENAME])
elif uploaded:
    # The uploaded name may differ from DATA_FILENAME (e.g. a renamed or
    # duplicate upload); use the first uploaded file rather than a possibly
    # stale copy on disk.
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    # Nothing was uploaded: fall back to a file with the expected name in the
    # working directory.
    csv_source = DATA_FILENAME

# skiprows=1 skips the first line of the file, so the second line supplies the
# column headers (presumably the first line is a title row -- confirm against
# the data file).
df = pd.read_csv(csv_source, skiprows=1)

# Drop the "Total" column when present; errors="ignore" makes this a no-op
# when the column does not exist.
df = df.drop(columns=["Total"], errors="ignore")

print("Cleaned format:")
print(df.head())

# Reshape wide -> long: "Age Group" is kept as the identifier, every other
# column header becomes a value in "Year", and the cell under it becomes the
# matching "Cases" value.
df_long = pd.melt(
    df,
    id_vars=["Age Group"],
    var_name="Year",
    value_name="Cases",
)

print("\nLong format preview:")
print(df_long.head(n=12))

# --- Clean the long-format table, then encode the age groups ------------------
# "Year" values come from the melted column headers; anything that is not a
# number becomes NaN and is removed by the dropna() below.
df_long["Year"] = pd.to_numeric(df_long["Year"], errors="coerce")

# "Cases" is used as a numeric feature further down. If the column was read as
# text (e.g. "1,234"), strip thousands separators before converting; values
# that still cannot be parsed become NaN and are dropped.
cases = df_long["Cases"]
if not pd.api.types.is_numeric_dtype(cases):
    cases = cases.astype(str).str.replace(",", "", regex=False).str.strip()
df_long["Cases"] = pd.to_numeric(cases, errors="coerce")

df_long.dropna(inplace=True)

# Fit the encoder only after incomplete rows are gone, so le.classes_ contains
# exactly the age groups that remain in df_long (no "nan" class and no class
# without rows).
le = LabelEncoder()
df_long["AgeGroupEncoded"] = le.fit_transform(df_long["Age Group"].astype(str))

# NOTE(review): `scaler`, `X` and `y` are not consumed by the classifier below
# (it is trained on "Year"/"Cases" to predict "AgeGroupEncoded"). They are kept
# unchanged in case other cells rely on them -- confirm and remove if unused.
scaler = StandardScaler()
X = scaler.fit_transform(df_long[["Year", "AgeGroupEncoded"]])
y = df_long["Cases"]

# --- Classifier: predict the age group from (Year, Cases) ---------------------
X_train, X_test, y_train, y_test = train_test_split(
    df_long[["Year", "Cases"]],
    df_long["AgeGroupEncoded"],
    test_size=0.3,
    random_state=42,
)

# Standardize inside the pipeline so the scaler is fit on the training split
# only and the same transform is applied automatically at predict time.
# `clf` therefore still accepts the raw "Year"/"Cases" columns.
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report only the classes that occur in the test labels or the predictions, and
# show them under their original age-group names instead of the integer codes.
report_labels = sorted(set(y_test) | set(y_pred))
report_names = [str(name) for name in le.inverse_transform(report_labels)]

print("\nClassification Report (predicting Age Group):")
print(
    classification_report(
        y_test,
        y_pred,
        labels=report_labels,
        target_names=report_names,
        zero_division=0,  # classes never predicted score 0 instead of warning
    )
)

# --- KMeans clustering on (Cases, Year) ---------------------------------------
# KMeans is distance based, so both columns are standardized first; otherwise
# the column with the larger numeric spread dominates the cluster assignment.
# Note: kmeans.cluster_centers_ are therefore expressed in standardized units.
cluster_features = StandardScaler().fit_transform(df_long[["Cases", "Year"]])

# n_init is set explicitly so the result does not depend on the library's
# version-specific default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(cluster_features)

df_long["Cluster"] = clusters
print("\nCluster distribution:")
print(df_long["Cluster"].value_counts())

# Scatter of cases per year, one point per row of df_long, coloured by the
# KMeans cluster label assigned above.
axes = plt.gca()
axes.scatter(
    df_long["Year"],
    df_long["Cases"],
    c=clusters,
    cmap="viridis",
)
axes.set_xlabel("Year")
axes.set_ylabel("Cases")
axes.set_title("KMeans Clustering of HIV Cases by Year")
plt.show()
