In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found,
# in the order os.walk yields them.
for input_path in (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/corona-virus-report/covid_19_clean_complete.csv
/kaggle/input/corona-virus-report/country_wise_latest.csv
/kaggle/input/corona-virus-report/day_wise.csv
/kaggle/input/corona-virus-report/usa_county_wise.csv
/kaggle/input/corona-virus-report/worldometer_data.csv
/kaggle/input/corona-virus-report/full_grouped.csv


In [2]:
import pandas as pd
import logging

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Read the source CSV and report its raw shape.
df = pd.read_csv("/kaggle/input/corona-virus-report/covid_19_clean_complete.csv")
logging.info(f"Loaded covid_19_clean_complete with shape {df.shape}")

# Column transformations, applied in this order by `assign`:
#   date            -> parsed to datetime
#   province/state  -> missing entries labelled 'Unknown'
#   active          -> confirmed minus deaths minus recovered
derived_columns = {
    'date': lambda frame: pd.to_datetime(frame['date']),
    'province/state': lambda frame: frame['province/state'].fillna('Unknown'),
    'active': lambda frame: frame['confirmed'] - frame['deaths'] - frame['recovered'],
}

df = (
    df
    # Normalize headers: trim whitespace, lower-case, spaces -> underscores.
    .rename(columns=lambda label: label.strip().lower().replace(' ', '_'))
    .assign(**derived_columns)
)

# Report the resulting shape and the total count of remaining nulls.
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Persist the cleaned frame to the working directory (no index column).
df.to_csv("cleaned_covid_19_clean_complete.csv", index=False)
logging.info("Preprocessing complete. cleaned_covid_19_clean_complete.csv saved.")


In [3]:
import pandas as pd
import logging

# Preprocess day_wise.csv: normalize headers, parse dates, add a case-fatality
# rate column, log basic validation figures, and save the cleaned CSV.

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Load the dataset
df = pd.read_csv("/kaggle/input/corona-virus-report/day_wise.csv")
logging.info(f"Loaded day_wise with shape {df.shape}")

# Standardize column names (trim, lower-case, spaces -> underscores)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Convert 'date' to datetime
df['date'] = pd.to_datetime(df['date'])

# Add derived columns
# Case-fatality rate as a percentage of confirmed cases. Zero denominators are
# masked to NaN first: a plain division would yield inf for deaths > 0 with
# confirmed == 0, and fillna(0) does not replace inf. With the mask, every
# undefined rate becomes NaN and is then reported as 0.
confirmed_nonzero = df['confirmed'].where(df['confirmed'] != 0)
df['case_fatality_rate'] = (df['deaths'] / confirmed_nonzero * 100).fillna(0)

# Validation: resulting shape and total count of remaining nulls
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Save the cleaned dataset
df.to_csv("cleaned_day_wise.csv", index=False)
logging.info("Preprocessing complete. cleaned_day_wise.csv saved.")


In [4]:
import pandas as pd
import logging

# Preprocess country_wise_latest.csv: normalize headers, zero-fill missing
# "per 100" ratio columns, log basic validation figures, and save the result.

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Load the dataset
df = pd.read_csv("/kaggle/input/corona-virus-report/country_wise_latest.csv")
logging.info(f"Loaded country_wise_latest with shape {df.shape}")

# Standardize column names (trim, lower-case, spaces -> underscores)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Replace missing percentage values with 0
# The header normalization above turns spaces into underscores, so a header
# spaced like "Deaths / 100 Cases" becomes "deaths_/_100_cases" and never
# contains the literal "/100_". Matching with underscores removed catches both
# the spaced and unspaced spellings.
# NOTE(review): the spaced header form is assumed from the published dataset --
# confirm against the actual file.
percentage_columns = [col for col in df.columns if '/100' in col.replace('_', '')]
for col in percentage_columns:
    df[col] = df[col].fillna(0)

# Validation: resulting shape and total count of remaining nulls
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Save the cleaned dataset
df.to_csv("cleaned_country_wise_latest.csv", index=False)
logging.info("Preprocessing complete. cleaned_country_wise_latest.csv saved.")


In [5]:
import pandas as pd
import logging

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Read the source CSV and report its raw shape.
df = pd.read_csv("/kaggle/input/corona-virus-report/full_grouped.csv")
logging.info(f"Loaded full_grouped with shape {df.shape}")

df = (
    df
    # Normalize headers: trim whitespace, lower-case, spaces -> underscores.
    .rename(columns=lambda label: label.strip().lower().replace(' ', '_'))
    .assign(
        # Parse the date strings into datetime values.
        date=lambda frame: pd.to_datetime(frame['date']),
        # Active cases = confirmed minus deaths minus recovered.
        active=lambda frame: frame['confirmed'] - frame['deaths'] - frame['recovered'],
    )
)

# Report the resulting shape and the total count of remaining nulls.
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Persist the cleaned frame to the working directory (no index column).
df.to_csv("cleaned_full_grouped.csv", index=False)
logging.info("Preprocessing complete. cleaned_full_grouped.csv saved.")


In [6]:
import pandas as pd
import logging

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Read the source CSV and report its raw shape.
df = pd.read_csv("/kaggle/input/corona-virus-report/usa_county_wise.csv")
logging.info(f"Loaded usa_county_wise with shape {df.shape}")

# Normalize headers: trim whitespace, lower-case, spaces -> underscores.
df = df.rename(columns=lambda label: label.strip().lower().replace(' ', '_'))

# Label missing entries in the identifier-style columns as 'Unknown'.
# NOTE(review): if 'fips' is numeric in the raw file, this yields a column that
# mixes numbers with the string 'Unknown' -- confirm that is intended.
for column_name in ('fips', 'admin2'):
    df[column_name] = df[column_name].fillna('Unknown')

# Report the resulting shape and the total count of remaining nulls.
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Persist the cleaned frame to the working directory (no index column).
df.to_csv("cleaned_usa_county_wise.csv", index=False)
logging.info("Preprocessing complete. cleaned_usa_county_wise.csv saved.")


In [7]:
import pandas as pd
import logging

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Read the source CSV and report its raw shape.
df = pd.read_csv("/kaggle/input/corona-virus-report/worldometer_data.csv")
logging.info(f"Loaded worldometer_data with shape {df.shape}")

# Per-column replacements for missing values: a text label for the continent,
# zero for the listed numeric columns. Columns not listed keep their nulls.
fill_defaults = {
    'continent': 'Unknown',
    'population': 0,
    'newcases': 0,
    'newdeaths': 0,
    'newrecovered': 0,
}

df = (
    df
    # Normalize headers: trim whitespace, lower-case, spaces -> underscores.
    .rename(columns=lambda label: label.strip().lower().replace(' ', '_'))
    .fillna(fill_defaults)
)

# Report the resulting shape and the total count of remaining nulls.
logging.info(f"Shape after preprocessing: {df.shape}")
logging.info(f"Missing values: {df.isnull().sum().sum()}")

# Persist the cleaned frame to the working directory (no index column).
df.to_csv("cleaned_worldometer_data.csv", index=False)
logging.info("Preprocessing complete. cleaned_worldometer_data.csv saved.")
