<a href="https://colab.research.google.com/github/susannelobo/Alexis/blob/main/DiseaseOutbreakPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import torch
import joblib
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import xgboost as xgb
from google.colab import drive
from google.colab import files



# Mount Google Drive; the trained models are written there further down.
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/covid_data"
# Opens the Colab upload dialog. The two CSV files read below are opened by
# bare filename, so they are expected to be among the uploaded files.
uploaded = files.upload()

confirmed_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
population_df = pd.read_csv('population.csv')

# Collapse province-level rows into one row per country, then discard the
# summed coordinates so only the per-date columns remain.
_country_totals = confirmed_df.groupby('Country/Region').sum(numeric_only=True)
confirmed_grouped = _country_totals.drop(columns=['Lat', 'Long'])

# Keep the 2020 population rows and rename their columns.
_population_renames = {"Country Name": "Country/Region", "Value": "Population Density"}
population_2020_df = population_df.loc[population_df['Year'] == 2020].rename(columns=_population_renames)

filtered_countries = ['India']
merged_df = confirmed_grouped.loc[filtered_countries]

# One sentence per date column: the fine-tuning corpus for the language model.
train_data = [
    f"On {day}, the number of confirmed COVID-19 cases in India is {merged_df.loc['India', day]}."
    for day in merged_df.columns
]
train_df = pd.DataFrame(train_data, columns=['text'])

# Seeded 80/20 split of the sentences.
train_texts, val_texts = train_test_split(
    train_df['text'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Use the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

_BASE_CHECKPOINT = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(_BASE_CHECKPOINT)
model = model.to(device)


def tokenize_data(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Every sequence is padded or truncated to exactly 128 tokens and the
    encoding is requested as PyTorch tensors.
    """
    encoding_options = {
        'max_length': 128,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encoding_options)

# Encode the training split first, then the validation split.
train_tokenized, val_tokenized = (
    tokenize_data(split_texts) for split_texts in (train_texts, val_texts)
)


class DatePredictionDataset(Dataset):
    """Causal-LM dataset over pre-tokenized, padded sequences.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    The labels are a copy of the input ids in which every padded position
    (``attention_mask == 0``) is replaced by ``-100``, the index the
    cross-entropy loss ignores. Without that masking the loss is averaged
    over the padding as well as the real sentence tokens.
    """

    # Label value skipped by the loss computation.
    IGNORE_INDEX = -100

    def __init__(self, input_ids, attention_masks):
        """Store two parallel tensors of token ids and attention masks.

        Both arguments must be tensors indexable along their first dimension
        (``__getitem__`` calls ``.clone()`` on a row of ``input_ids``).
        """
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the model inputs and loss-masked labels for sequence ``idx``."""
        token_ids = self.input_ids[idx]
        mask = self.attention_masks[idx]
        # Clone so that masking the labels never writes into the stored ids.
        labels = token_ids.clone()
        labels[mask == 0] = self.IGNORE_INDEX
        return {'input_ids': token_ids, 'attention_mask': mask, 'labels': labels}


# Wrap the tokenized splits so the Trainer can index them item by item.
train_dataset = DatePredictionDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'])
val_dataset = DatePredictionDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'])


training_args = TrainingArguments(
    output_dir="/content/results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # A fixed 500-step warmup is longer than the whole single-epoch run (the
    # logged run in this notebook ends around step 450), so the learning rate
    # never reached its configured peak. A ratio scales with the real number
    # of optimisation steps instead.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="/content/logs",
    logging_steps=10,
    evaluation_strategy="steps",
    # eval_steps and save_steps are kept equal so the best checkpoint can be
    # reloaded at the end of training.
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    # Mixed precision requires CUDA; `device` above falls back to CPU, where
    # an unconditional fp16=True is rejected.
    fp16=torch.cuda.is_available(),
)


trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)
trainer.train()


# Persist the fine-tuned model and its tokenizer side by side on Drive so the
# inference cell can reload both from one directory.
MODEL_SAVE_PATH = "/content/drive/MyDrive/trained_llm"
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)


# Build the XGBoost training set from the training sentences themselves.
#
# `train_texts` is a shuffled subset of the dates, so taking the last
# len(train_texts) columns of `merged_df` as labels pairs each sentence with
# the count of an unrelated date. Splitting every sentence into its prompt
# ("On <date>, ... in India is ") and its count keeps feature and label
# together. Encoding only the prompt also matches what the inference cell
# feeds the regressor, and keeps the answer digits out of the features.
xgb_prompts = []
xgb_targets = []
for text in train_texts:
    prompt, separator, count_text = text.rpartition(" is ")
    xgb_prompts.append(prompt + separator)
    # Every sentence ends with a full stop after the number.
    xgb_targets.append(float(count_text.rstrip(".")))

X_features = [
    tokenizer.encode(prompt, max_length=128, padding='max_length', truncation=True)
    for prompt in xgb_prompts
]
X_features = torch.tensor(X_features).numpy()
# float64 keeps large case counts exact on the way into XGBoost.
y_labels = torch.tensor(xgb_targets, dtype=torch.float64).numpy()


xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.05)
xgb_model.fit(X_features, y_labels)


XGB_SAVE_PATH = "/content/drive/MyDrive/xgb_covid_model.pkl"
joblib.dump(xgb_model, XGB_SAVE_PATH)

print("✅ Training completed! Models saved in Google Drive.")

Mounted at /content/drive


Saving My_Secret.txt to My_Secret.txt
Saving population.csv to population.csv
Saving time_series_covid19_confirmed_global.csv to time_series_covid19_confirmed_global.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mruchirchitre[0m ([33mruchirchitre-don-bosco-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,1.3204,0.645366
100,0.2154,0.205682
150,0.2013,0.199523
200,0.2001,0.195042
250,0.1958,0.199699
300,0.1934,0.196446
350,0.2005,0.197755
400,0.1865,0.192685


Step,Training Loss,Validation Loss
50,1.3204,0.645366
100,0.2154,0.205682
150,0.2013,0.199523
200,0.2001,0.195042
250,0.1958,0.199699
300,0.1934,0.196446
350,0.2005,0.197755
400,0.1865,0.192685
450,0.1918,0.194372


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


✅ Training completed! Models saved in Google Drive.


In [None]:
from google.colab import files
uploaded = files.upload()
import os

# Read the secret file: one KEY=VALUE assignment per line.
with open('/content/drive/My Drive/Colab Notebooks/My_Secret.txt') as f:
    for line_number, raw_line in enumerate(f, start=1):
        line = raw_line.strip()
        # Blank lines and '#' comments carry no assignment.
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, separator, value = line.partition('=')
        if not separator or not key.strip():
            raise ValueError(f"My_Secret.txt line {line_number} is not in KEY=VALUE form")
        os.environ[key.strip()] = value.strip()

# Test it. The lookup still raises KeyError when the key is missing, but the
# value itself is never printed: cell output is saved with the notebook.
secret_value = os.environ['MY_SECRET_KEY']
print(f"MY_SECRET_KEY loaded ({len(secret_value)} characters)")


In [None]:
import os
import torch
import joblib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import xgboost as xgb

# ✅ Load saved LLM model and tokenizer from Google Drive
MODEL_PATH = "/content/drive/MyDrive/trained_llm"
# Resolve the device once and reuse it for both the model and its inputs.
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH).to(inference_device)

# ✅ Load saved XGBoost model from Google Drive
# NOTE: joblib.load unpickles the file; only load model files you created.
XGB_MODEL_PATH = "/content/drive/MyDrive/xgb_covid_model.pkl"
xgb_model = joblib.load(XGB_MODEL_PATH)

# ✅ Define a test date
test_date = "2021-07-01"
# The training sentences were built from the CSV column labels, which the
# other cells of this notebook parse with '%m/%d/%y'. Render the date the same
# way so the prompt looks like the data both models were fitted on.
# NOTE(review): assumes the labels are not zero-padded (e.g. 7/1/21) - confirm
# against the CSV header.
_test_ts = pd.Timestamp(test_date)
prompt_date = f"{_test_ts.month}/{_test_ts.day}/{_test_ts.strftime('%y')}"
test_prompt = f"On {prompt_date}, the number of confirmed COVID-19 cases in India is "

# ✅ Generate prediction using LLM
inputs = tokenizer(test_prompt, return_tensors="pt").to(inference_device)
# Name the padding token explicitly so generate() does not have to guess it.
output = model.generate(**inputs, max_length=50, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("🔹 LLM Prediction:", generated_text)

# ✅ Prepare input features for XGBoost model
# Same encoding settings as at training time: fixed length of 128 token ids.
x_test_features = tokenizer.encode(test_prompt, max_length=128, padding="max_length", truncation=True)
x_test_features = torch.tensor(x_test_features).unsqueeze(0).numpy()

# ✅ Predict using XGBoost
xgb_prediction = xgb_model.predict(x_test_features)[0]

print(f"🔹 XGBoost Predicted Cases for {test_date}: {int(xgb_prediction)}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import ipywidgets as widgets
from IPython.display import display

# Simulated daily case data (replace with real if needed)
np.random.seed(0)
days_2020 = pd.date_range(start="2020-01-01", end="2020-12-31")
# `train.df` was never defined anywhere, so this cell stopped with a
# NameError. Draw one placeholder count per day instead, as the comment above
# describes; the seed makes the draw repeatable. The range is arbitrary.
daily_cases = np.random.randint(0, 1000, size=len(days_2020))

df_daily = pd.DataFrame({
    "Date": days_2020,
    "Daily_Cases": daily_cases
})
df_daily["Month"] = df_daily["Date"].dt.month
# Sum the daily counts into one total per calendar month.
monthly_cases = df_daily.groupby("Month")["Daily_Cases"].sum().reset_index()
monthly_cases.columns = ["Month", "Monthly_Cases"]

# Model training on 2020 data: month number is the only feature.
X = monthly_cases[["Month"]]
y = monthly_cases["Monthly_Cases"]

pipeline = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
pipeline.fit(X, y)

# Widget to enter multiple years
year_input = widgets.Text(
    value='2021,2022',
    placeholder='Enter years separated by commas',
    description='Years:',
    disabled=False
)
display(year_input)

def on_year_submit(change):
    """Plot the predicted monthly cases for every year typed into ``year_input``.

    Registered as an observer of the text box's ``value``, so it runs each
    time the value changes. ``change`` is the notification passed by the
    widget and is not used; the text is read from ``year_input`` directly.

    Prints a usage hint and returns without plotting when the box holds no
    year or holds an entry that is not a whole number.
    """
    entries = [entry.strip() for entry in year_input.value.split(',')]
    entries = [entry for entry in entries if entry]  # tolerate "2021," while typing
    # Validate explicitly: the original filter dropped bad entries silently,
    # so its error message could never be shown.
    if not entries or not all(entry.isdecimal() for entry in entries):
        print("Please enter years as comma-separated values like: 2021,2022")
        return
    years = [int(entry) for entry in entries]

    # The model's only feature is the month number, so the prediction is the
    # same for every year; compute it once and draw it once per label.
    months = np.arange(1, 13)
    predicted_cases = pipeline.predict(pd.DataFrame({"Month": months}))

    plt.figure(figsize=(12, 6))
    for year in years:
        plt.plot(months, predicted_cases, marker='o', label=f"{year}")

    plt.title("Predicted Monthly COVID-19 Cases")
    plt.xlabel("Month")
    plt.ylabel("Predicted Cases")
    plt.xticks(months, [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
    ])
    plt.grid(True)
    plt.legend(title="Year")
    plt.tight_layout()
    plt.show()

year_input.observe(on_year_submit, names='value')


NameError: name 'train' is not defined

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



import ipywidgets as widgets  # used by the controls below; missing from this cell's imports

# ---- Process your dataframe ----
# `df_cases` was never created anywhere, so this cell stopped with a
# NameError. Build it here from the same CSV the other cells load: India's
# cumulative confirmed counts, one row per date label.
_confirmed = pd.read_csv('time_series_covid19_confirmed_global.csv')
_india_cumulative = (
    _confirmed.groupby('Country/Region')
    .sum(numeric_only=True)
    .drop(columns=['Lat', 'Long'])
    .loc['India']
)
# The file holds running totals; day-over-day differences give daily new
# cases. The first day has no predecessor, so it keeps its own total.
df_cases = _india_cumulative.diff().fillna(_india_cumulative).to_frame(name="Daily_Cases")
df_cases.index = pd.to_datetime(df_cases.index, format='%m/%d/%y')
# The metrics printed by the callback are labelled as 2020 training data, and
# grouping by month number would otherwise mix several years together.
df_cases = df_cases[df_cases.index.year == 2020].copy()
df_cases.loc[:, "Month"] = df_cases.index.month

monthly_cases = df_cases.groupby("Month").sum().sum(axis=1).reset_index()
monthly_cases.columns = ["Month", "Monthly_Cases"]

# ---- Train your model ----
# Month number is the only feature.
X = monthly_cases[["Month"]]
y = monthly_cases["Monthly_Cases"]

pipeline = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
pipeline.fit(X, y)

# ---- User Interface ----
year_input = widgets.Text(
    value='2021,2022',
    placeholder='Enter years separated by commas',
    description='Years:',
    disabled=False
)

submit_button = widgets.Button(description="Predict & Plot")
output = widgets.Output()

display(widgets.HBox([year_input, submit_button]), output)

# ---- Callback ----
def on_button_click(b):
    """Plot predicted monthly cases for the entered years and print fit metrics.

    Runs when ``submit_button`` is clicked; ``b`` is the button instance the
    widget passes in and is not used. Everything is rendered inside
    ``output``, which is cleared first.

    Prints a usage hint and returns without plotting when ``year_input``
    holds no year or holds an entry that is not a whole number.
    """
    with output:
        output.clear_output()

        entries = [entry.strip() for entry in year_input.value.split(',')]
        entries = [entry for entry in entries if entry]
        # Validate explicitly: the original filter dropped bad entries
        # silently, so its error message could never be shown and an empty
        # list went on to draw an empty, zero-height plot.
        if not entries or not all(entry.isdecimal() for entry in entries):
            print("Please enter valid years like: 2021,2022")
            return
        years = [int(entry) for entry in entries]

        # The model's only feature is the month number, so the prediction is
        # identical for every year; compute it once.
        months = np.arange(1, 13)
        predicted_cases = pipeline.predict(pd.DataFrame({"Month": months}))
        max_pred = max(0, predicted_cases.max())

        plt.figure(figsize=(12, 6))
        for year in years:
            plt.plot(months, predicted_cases, marker='o', label=f"{year}")

        plt.title("Predicted Monthly COVID-19 Cases")
        plt.xlabel("Month")
        plt.ylabel("Predicted Cases")
        plt.xticks(months, [
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
        ])
        # Only pin the axis when there is a positive range to show; identical
        # lower and upper limits are not a usable axis.
        if max_pred > 0:
            plt.ylim(0, max_pred * 1.1)  # add a little padding above max
        plt.grid(True)
        plt.legend(title="Year")
        plt.tight_layout()
        plt.show()

        # Metrics: these score the fit on the same rows the model was trained
        # on, so they measure fit quality, not forecasting accuracy.
        y_pred_2020 = pipeline.predict(X)
        mae = mean_absolute_error(y, y_pred_2020)
        mse = mean_squared_error(y, y_pred_2020)
        rmse = np.sqrt(mse)
        r2 = r2_score(y, y_pred_2020)

        # Print metrics
        print("🔍 Model Accuracy on 2020 (Training Data):")
        print(f"MAE  = {mae:.2f} cases")
        print(f"MSE  = {mse:.2f}")
        print(f"RMSE = {rmse:.2f} cases")
        print(f"R²   = {r2:.3f}")



submit_button.on_click(on_button_click)

NameError: name 'df_cases' is not defined

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import ipywidgets as widgets
from IPython.display import display, Markdown

# --- UI Header ---
display(Markdown("""
# 🔬 COVID-19 Outbreak Prediction Project
Understanding and predicting disease outbreaks is vital for preparing healthcare systems, informing the public, and guiding policy decisions. This project focuses on predicting COVID-19 cases in India, helping support public health planning and social good through timely insights.
"""))

# --- Load and Prepare Data ---
confirmed_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
population_df = pd.read_csv('population.csv')

# Aggregate data by country
confirmed_grouped = confirmed_df.groupby('Country/Region').sum(numeric_only=True).drop(columns=['Lat', 'Long'])

# Filter only India data; transpose so each date label becomes a row.
filtered_countries = ['India']
merged_df = confirmed_grouped.loc[filtered_countries].T
merged_df.columns = ['Confirmed']

# Specify the datetime format to suppress warnings
merged_df.index = pd.to_datetime(merged_df.index, format='%m/%d/%y', errors='coerce')
# errors='coerce' leaves NaT in the *index* for labels it cannot parse, and
# dropna() only inspects values, so such rows must be filtered explicitly.
merged_df = merged_df[merged_df.index.notna()].dropna()
# Keep 2020 only. The counts are cumulative, so grouping by month number over
# several years would compare month-end totals taken from different years;
# the metrics below are also reported as 2020 training data.
merged_df = merged_df[merged_df.index.year == 2020].copy()
merged_df['Month'] = merged_df.index.month

# Highest cumulative total per month, differenced into new cases per month.
# The first month has no predecessor, so it keeps its own total.
month_end_totals = merged_df.groupby("Month")["Confirmed"].max()
monthly_cases = month_end_totals.diff().fillna(month_end_totals).reset_index()
monthly_cases.columns = ["Month", "Monthly_Cases"]

# --- Train Model ---
# Month number is the only feature.
X = monthly_cases[["Month"]]
y = monthly_cases["Monthly_Cases"]

pipeline = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
pipeline.fit(X, y)

# --- Widgets ---
date_input = widgets.Text(
    value='2021-01-12',
    placeholder='Enter date (YYYY-MM-DD)',
    description='Date:'
)

year_input = widgets.Text(
    value='2021,2022',
    placeholder='Enter years separated by commas',
    description='Years:'
)

submit_button = widgets.Button(description="Predict")
output = widgets.Output()

ui = widgets.VBox([date_input, year_input, submit_button, output])
display(ui)

# --- Callback ---
def on_button_click(b):
    """Show the prediction for the entered date, the yearly plot and fit metrics.

    Runs when ``submit_button`` is clicked; ``b`` is the button instance the
    widget passes in and is not used. Everything is rendered inside
    ``output``, which is cleared first. The date section and the yearly
    section are independent: a bad date does not prevent the plot, and bad
    years do not prevent the date prediction.
    """
    with output:
        output.clear_output()

        # --- Date Prediction ---
        # Only parsing is guarded, so a real modelling error is no longer
        # reported as an "Invalid date".
        try:
            input_date = pd.to_datetime(date_input.value)
        except (ValueError, TypeError) as e:
            input_date = None
            print("Invalid date. Please use the format YYYY-MM-DD.", str(e))
        else:
            # Empty text parses to NaT rather than raising.
            if pd.isna(input_date):
                input_date = None
                print("Invalid date. Please use the format YYYY-MM-DD.", "no date entered")

        if input_date is not None:
            # The model only sees the month number of the entered date.
            predicted = pipeline.predict(pd.DataFrame({"Month": [input_date.month]}))[0]
            display(Markdown(f"\n🗓️ On **{input_date.date()}**, the predicted number of cases in India is approximately **{int(predicted):,}**."))

        # --- Yearly Graphs ---
        entries = [entry.strip() for entry in year_input.value.split(',')]
        entries = [entry for entry in entries if entry]
        # Validate explicitly: the original filter dropped bad entries
        # silently, and an empty list went on to draw an empty plot.
        if not entries or not all(entry.isdecimal() for entry in entries):
            print("Year input error:", "please enter years separated by commas, e.g. 2021,2022")
            return
        years = [int(entry) for entry in entries]

        # The prediction depends on the month only, so it is the same for
        # every year; compute it once.
        months = np.arange(1, 13)
        predicted_cases = pipeline.predict(pd.DataFrame({"Month": months}))
        max_pred = max(0, predicted_cases.max())

        plt.figure(figsize=(12, 6))
        for year in years:
            plt.plot(months, predicted_cases, marker='o', label=str(year))

        plt.title("Predicted Monthly COVID-19 Cases in India")
        plt.xlabel("Month")
        plt.ylabel("Predicted Cases")
        plt.xticks(months, ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
        # Identical lower and upper limits are not a usable axis.
        if max_pred > 0:
            plt.ylim(0, max_pred * 1.1)
        plt.grid(True)
        plt.legend(title="Year")
        plt.tight_layout()
        plt.show()

        # --- Metrics ---
        # Scored on the same rows the model was fitted on.
        y_pred = pipeline.predict(X)
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y, y_pred)

        # Build the Markdown without leading indentation: lines indented by
        # four or more spaces are rendered as a code block, not as a heading
        # and bullet list.
        metrics_markdown = "\n".join([
            "### 🔍 Model Accuracy on 2020 Training Data:",
            f"- **MAE**  = {mae:.2f} cases",
            f"- **MSE**  = {mse:.2f}",
            f"- **RMSE** = {rmse:.2f} cases",
            f"- **R²**   = {r2:.3f}",
        ])
        display(Markdown(metrics_markdown))

submit_button.on_click(on_button_click)



# 🔬 COVID-19 Outbreak Prediction Project
Understanding and predicting disease outbreaks is vital for preparing healthcare systems, informing the public, and guiding policy decisions. This project focuses on predicting COVID-19 cases in India, helping support public health planning and social good through timely insights.


VBox(children=(Text(value='2021-01-12', description='Date:', placeholder='Enter date (YYYY-MM-DD)'), Text(valu…