In [1]:
import pandas as pd

# Load the datasets
confirmed_df = pd.read_csv("time_series_covid19_confirmed_global.csv")
deaths_df = pd.read_csv("time_series_covid19_deaths_global.csv")
recovered_df = pd.read_csv("time_series_covid19_recovered_global.csv")

# Inspect the data
print(confirmed_df.head())
print(deaths_df.head())
print(recovered_df.head())


  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  \
0            NaN    Afghanistan  33.93911  67.709953        0        0   
1            NaN        Albania  41.15330  20.168300        0        0   
2            NaN        Algeria  28.03390   1.659600        0        0   
3            NaN        Andorra  42.50630   1.521800        0        0   
4            NaN         Angola -11.20270  17.873900        0        0   

   1/24/20  1/25/20  1/26/20  1/27/20  ...  2/28/23  3/1/23  3/2/23  3/3/23  \
0        0        0        0        0  ...   209322  209340  209358  209362   
1        0        0        0        0  ...   334391  334408  334408  334427   
2        0        0        0        0  ...   271441  271448  271463  271469   
3        0        0        0        0  ...    47866   47875   47875   47875   
4        0        0        0        0  ...   105255  105277  105277  105277   

   3/4/23  3/5/23  3/6/23  3/7/23  3/8/23  3/9/23  
0  209369  209390  209406  2

In [2]:
import pandas as pd


def preprocess_time_series(df, value_name):
    # Melt the DataFrame to long format
    df_melted = pd.melt(
        df,
        id_vars=["Province/State", "Country/Region", "Lat", "Long"],
        var_name="Date",
        value_name=value_name,
    )

    # Convert 'Date' column to datetime
    df_melted["Date"] = pd.to_datetime(df_melted["Date"], format="%m/%d/%y")

    # Sort by date
    df_melted.sort_values(by=["Country/Region", "Province/State", "Date"], inplace=True)

    return df_melted


# Load the datasets
confirmed_df = pd.read_csv("time_series_covid19_confirmed_global.csv")
deaths_df = pd.read_csv("time_series_covid19_deaths_global.csv")
recovered_df = pd.read_csv("time_series_covid19_recovered_global.csv")

# Preprocess each DataFrame
confirmed_df = preprocess_time_series(confirmed_df, "Confirmed")
deaths_df = preprocess_time_series(deaths_df, "Deaths")
recovered_df = preprocess_time_series(recovered_df, "Recovered")

# Print columns to verify
print("Confirmed DataFrame columns:", confirmed_df.columns)
print("Deaths DataFrame columns:", deaths_df.columns)
print("Recovered DataFrame columns:", recovered_df.columns)


Confirmed DataFrame columns: Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed'], dtype='object')
Deaths DataFrame columns: Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Deaths'], dtype='object')
Recovered DataFrame columns: Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Recovered'], dtype='object')


In [3]:
def merge_in_chunks(df1, df2, chunk_size=5000):
    merged_df = pd.DataFrame()
    for start in range(0, len(df1), chunk_size):
        end = start + chunk_size
        chunk = df1.iloc[start:end]
        try:
            chunk = pd.merge(
                chunk,
                df2,
                on=["Province/State", "Country/Region", "Lat", "Long", "Date"],
                how="left",
            )
            merged_df = pd.concat([merged_df, chunk])
        except KeyError as e:
            print(f"KeyError: {e}")
            print(f"Columns in chunk: {chunk.columns}")
            print(f"Columns in df2: {df2.columns}")
            break
        except MemoryError:
            print(f"MemoryError at chunk {start}-{end}")
            break
    return merged_df


# Merge datasets
merged_df = merge_in_chunks(confirmed_df, deaths_df, chunk_size=5000)
merged_df = merge_in_chunks(merged_df, recovered_df, chunk_size=5000)

# Check the merged DataFrame
print(merged_df.head())


  Province/State Country/Region       Lat       Long       Date  Confirmed  \
0            NaN    Afghanistan  33.93911  67.709953 2020-01-22          0   
1            NaN    Afghanistan  33.93911  67.709953 2020-01-23          0   
2            NaN    Afghanistan  33.93911  67.709953 2020-01-24          0   
3            NaN    Afghanistan  33.93911  67.709953 2020-01-25          0   
4            NaN    Afghanistan  33.93911  67.709953 2020-01-26          0   

   Deaths  Recovered  
0       0        0.0  
1       0        0.0  
2       0        0.0  
3       0        0.0  
4       0        0.0  


In [4]:
# Feature Engineering
merged_df["Daily_Confirmed"] = (
    merged_df.groupby(["Country/Region", "Province/State"])["Confirmed"]
    .diff()
    .fillna(0)
)
merged_df["Daily_Deaths"] = (
    merged_df.groupby(["Country/Region", "Province/State"])["Deaths"].diff().fillna(0)
)
merged_df["Daily_Recovered"] = (
    merged_df.groupby(["Country/Region", "Province/State"])["Recovered"]
    .diff()
    .fillna(0)
)


In [5]:
import pandas as pd

# Load the main population data
population_df = pd.read_csv(
    "API_SP.POP.TOTL_DS2_en_csv_v2_87.csv", skiprows=4
)  # Skipping the first few rows to get to the data


In [6]:
# Load country metadata
country_metadata_df = pd.read_csv(
    "Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_87.csv"
    
)
print(country_metadata_df.head())


  Country Code                     Region          IncomeGroup  \
0          ABW  Latin America & Caribbean          High income   
1          AFE                        NaN                  NaN   
2          AFG                 South Asia           Low income   
3          AFW                        NaN                  NaN   
4          AGO         Sub-Saharan Africa  Lower middle income   

                                        SpecialNotes  \
0                                                NaN   
1  26 countries, stretching from the Red Sea in t...   
2  The reporting period for national accounts dat...   
3  22 countries, stretching from the westernmost ...   
4  The World Bank systematically assesses the app...   

                     TableName  Unnamed: 5  
0                        Aruba         NaN  
1  Africa Eastern and Southern         NaN  
2                  Afghanistan         NaN  
3   Africa Western and Central         NaN  
4                       Angola         Na

In [7]:
# Clean column names
population_df.columns = population_df.columns.str.strip()

# Merge on the country name or country code
merged_df = pd.merge(
    merged_df,
    population_df[["Country Name", "2000"]],
    how="left",
    left_on="Country/Region",
    right_on="Country Name",
)

# Rename columns for clarity if needed
merged_df.rename(columns={"2000": "Population"}, inplace=True)

print(merged_df.head())


  Province/State Country/Region       Lat       Long       Date  Confirmed  \
0            NaN    Afghanistan  33.93911  67.709953 2020-01-22          0   
1            NaN    Afghanistan  33.93911  67.709953 2020-01-23          0   
2            NaN    Afghanistan  33.93911  67.709953 2020-01-24          0   
3            NaN    Afghanistan  33.93911  67.709953 2020-01-25          0   
4            NaN    Afghanistan  33.93911  67.709953 2020-01-26          0   

   Deaths  Recovered  Daily_Confirmed  Daily_Deaths  Daily_Recovered  \
0       0        0.0              0.0           0.0              0.0   
1       0        0.0              0.0           0.0              0.0   
2       0        0.0              0.0           0.0              0.0   
3       0        0.0              0.0           0.0              0.0   
4       0        0.0              0.0           0.0              0.0   

  Country Name  Population  
0  Afghanistan  19542982.0  
1  Afghanistan  19542982.0  
2  Afghanis

In [18]:
merged_df.to_csv("merged_df_population.csv", index=False)

In [8]:
merged_df_with_population = pd.read_csv("merged_df_population.csv",low_memory=False)

In [9]:
merged_df_with_population.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered', 'Daily_Confirmed', 'Daily_Deaths',
       'Daily_Recovered', 'Country Name', 'Population'],
      dtype='object')

In [10]:
# Define thresholds
infection_percent_threshold = 0.1  # Set to 0.1% for now
daily_cases_threshold = 100  # Set an absolute threshold for daily cases
rolling_window = 7  # 7-day rolling window to smooth the data

# Calculate infection rate
merged_df_with_population["Infection_Rate"] = (
    merged_df_with_population["Daily_Confirmed"]
    / merged_df_with_population["Population"]
) * 100

# Apply rolling averages for smoothing
merged_df_with_population["Rolling_Daily_Confirmed"] = (
    merged_df_with_population["Daily_Confirmed"].rolling(window=rolling_window).mean()
)
merged_df_with_population["Rolling_Infection_Rate"] = (
    merged_df_with_population["Infection_Rate"].rolling(window=rolling_window).mean()
)

# Hotspot is True if either the infection rate or the absolute daily cases exceed the thresholds
merged_df_with_population["Hotspot"] = (
    merged_df_with_population["Rolling_Infection_Rate"] >= infection_percent_threshold
) | (merged_df_with_population["Rolling_Daily_Confirmed"] >= daily_cases_threshold)

# Display the results
print(merged_df_with_population["Hotspot"].value_counts())


Hotspot
False    316879
True      13448
Name: count, dtype: int64


In [11]:
print(merged_df_with_population.columns)

Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered', 'Daily_Confirmed', 'Daily_Deaths',
       'Daily_Recovered', 'Country Name', 'Population', 'Infection_Rate',
       'Rolling_Daily_Confirmed', 'Rolling_Infection_Rate', 'Hotspot'],
      dtype='object')


In [37]:
%pip install joblib




In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import joblib  # For saving the model

# Load your dataset
# merged_df_with_population = pd.read_csv('path_to_your_merged_data.csv')

# Select features and target
features = ["Daily_Confirmed", "Infection_Rate", "Rolling_Daily_Confirmed"]
target = "Hotspot"

# Drop rows with NaN values in selected columns
df_model = merged_df_with_population.dropna(subset=features + [target])

# Define X and y
X = df_model[features]
y = df_model[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Save the model and scaler for use in the Flask app
joblib.dump(knn, "knn_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']