In [68]:
import pandas as pd

# Path to the single metadata export (tab-separated) inspected in this cell.
file_path = 'raw_data/1718053372032.metadata.tsv'

# Parse the file with a tab as the field delimiter.
rawdata = pd.read_csv(file_path, delimiter='\t')

# Print a preview (the first five rows) to confirm the load worked.
print(rawdata.head(5))


                               strain            virus    gisaid_epi_isl  \
0  hCoV-19/Australia/QLD0x0140B0/2024  betacoronavirus  EPI_ISL_18979667   
1  hCoV-19/Australia/QLD0x01418D/2024  betacoronavirus  EPI_ISL_19009760   
2  hCoV-19/Australia/QLD0x014092/2024  betacoronavirus  EPI_ISL_18979706   
3  hCoV-19/Australia/QLD0x01410E/2024  betacoronavirus  EPI_ISL_18979645   
4    hCoV-19/Australia/TAS006878/2024  betacoronavirus  EPI_ISL_18967246   

  genbank_accession        date   region    country    division location  \
0                 ?  2024-02-08  Oceania  Australia  Queensland      NaN   
1                 ?  2024-03-12  Oceania  Australia  Queensland      NaN   
2                 ?  2024-02-05  Oceania  Australia  Queensland      NaN   
3                 ?  2024-02-05  Oceania  Australia  Queensland      NaN   
4                 ?  2024-02-01  Oceania  Australia    Tasmania      NaN   

  region_exposure  ... pangolin_lineage GISAID_clade  \
0         Oceania  ...        

In [69]:
# Bind `data` to the same DataFrame object as `rawdata` (an alias, not a copy).
data = rawdata

def covid_tests_data_cleaning(data):
    """Filter a metadata table to human samples from Australia and tidy its columns.

    Rows are kept only when ``region`` is 'Oceania', ``country`` is 'Australia'
    and ``host`` is 'Human'. In ``age`` and ``sex`` the literal string
    'unknown' is replaced with ``pd.NA``. ``date`` and ``date_submitted`` are
    converted with ``pd.to_datetime`` (default error handling, so an
    unparseable value raises instead of becoming NaT), and ``age`` is converted
    with ``pd.to_numeric`` where unparseable values become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``region``, ``country``, ``host`` and every column
        named in ``columns_to_keep`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the ``columns_to_keep`` columns, in that
        order, with the original index labels of the surviving rows. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    # List of columns to keep
    columns_to_keep = [
        # virus
        'strain', 'virus', 'segment', 'length', 'gisaid_epi_isl', 'date',
        # location
        'division', 'location',
        # exposure
        'region_exposure', 'country_exposure', 'division_exposure',
        # patient
        'age', 'sex',
        # submission
        'originating_lab', 'submitting_lab', 'date_submitted'
    ]

    # Filter: human samples collected in Australia (Oceania)
    keep_rows = (
        (data['region'] == 'Oceania')
        & (data['country'] == 'Australia')
        & (data['host'] == 'Human')
    )

    # Take an explicit copy of the selection. Assigning columns into the
    # result of a boolean filter is chained assignment: pandas emits
    # SettingWithCopyWarning because it cannot tell whether the write is meant
    # for the original frame.
    data = data.loc[keep_rows, columns_to_keep].copy()

    # Treat the placeholder string 'unknown' as a missing value
    data[['age', 'sex']] = data[['age', 'sex']].replace('unknown', pd.NA)

    # Change data type
    data['date'] = pd.to_datetime(data['date'])
    data['date_submitted'] = pd.to_datetime(data['date_submitted'])
    data['age'] = pd.to_numeric(data['age'], errors='coerce')

    return data

# Rebind `data` to the cleaned frame returned by the function
data = covid_tests_data_cleaning(data)

# Display the first few rows of the filtered dataframe
print(data.head())

                               strain            virus    gisaid_epi_isl  \
0  hCoV-19/Australia/QLD0x0140B0/2024  betacoronavirus  EPI_ISL_18979667   
1  hCoV-19/Australia/QLD0x01418D/2024  betacoronavirus  EPI_ISL_19009760   
2  hCoV-19/Australia/QLD0x014092/2024  betacoronavirus  EPI_ISL_18979706   
3  hCoV-19/Australia/QLD0x01410E/2024  betacoronavirus  EPI_ISL_18979645   
4    hCoV-19/Australia/TAS006878/2024  betacoronavirus  EPI_ISL_18967246   

        date    division location region_exposure country_exposure  \
0 2024-02-08  Queensland      NaN         Oceania        Australia   
1 2024-03-12  Queensland      NaN         Oceania        Australia   
2 2024-02-05  Queensland      NaN         Oceania        Australia   
3 2024-02-05  Queensland      NaN         Oceania        Australia   
4 2024-02-01    Tasmania      NaN         Oceania        Australia   

  division_exposure segment  length  age   sex  \
0        Queensland  genome   29717  NaN  <NA>   
1        Queensland  g

In [86]:
# Redefine the cleaning function with multi-format date parsing, then read and combine all TSV files in raw_data/
def covid_tests_data_cleaning(data):
    """Filter a metadata table to human samples from Australia and tidy its columns.

    Rows are kept only when ``region`` is 'Oceania', ``country`` is 'Australia'
    and ``host`` is 'Human'. In ``age`` and ``sex`` the literal string
    'unknown' is replaced with ``pd.NA``. ``date`` and ``date_submitted`` are
    parsed as day-first ``dd/mm/YYYY`` or ISO ``YYYY-mm-dd`` values, and rows
    whose ``date`` cannot be parsed are dropped. ``age`` is converted with
    ``pd.to_numeric`` where unparseable values become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``region``, ``country``, ``host`` and every column
        named in ``columns_to_keep`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the ``columns_to_keep`` columns, in that
        order, with the original index labels of the surviving rows. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    # List of columns to keep
    columns_to_keep = [
        # virus
        'strain', 'virus', 'segment', 'length', 'gisaid_epi_isl', 'date',
        # location
        'division', 'location',
        # exposure
        'region_exposure', 'country_exposure', 'division_exposure',
        # patient
        'age', 'sex',
        # submission
        'originating_lab', 'submitting_lab', 'date_submitted'
    ]

    # Explicit formats, tried in this order for every value
    date_formats = ["%d/%m/%Y", "%Y-%m-%d"]

    def parse_dates(date_series):
        """Parse a column of date strings that may mix the known formats.

        Each explicit format is applied element-wise and only fills values
        that earlier formats left unparsed. A column mixing both formats is
        therefore parsed completely, and ambiguous values such as
        '05/03/2021' are always read day-first. Generic parsing is used only
        for non-missing values that neither format matched; anything still
        unparseable becomes NaT.
        """
        parsed = None
        for fmt in date_formats:
            attempt = pd.to_datetime(date_series, format=fmt, errors='coerce')
            parsed = attempt if parsed is None else parsed.fillna(attempt)
            if parsed.notna().all():
                return parsed

        # Values that are present but matched none of the explicit formats
        unresolved = (parsed.isna() & date_series.notna()).to_numpy()
        if unresolved.any():
            fallback = pd.to_datetime(date_series[unresolved], errors='coerce')
            parsed = parsed.copy()
            # Positional write, so it does not depend on index alignment
            parsed[unresolved] = fallback.to_numpy()
        return parsed

    # Filter: human samples collected in Australia (Oceania)
    keep_rows = (
        (data['region'] == 'Oceania')
        & (data['country'] == 'Australia')
        & (data['host'] == 'Human')
    )

    # Take an explicit copy of the selection. Assigning columns into the
    # result of a boolean filter is chained assignment: pandas emits
    # SettingWithCopyWarning because it cannot tell whether the write is meant
    # for the original frame.
    data = data.loc[keep_rows, columns_to_keep].copy()

    # Treat the placeholder string 'unknown' as a missing value
    data[['age', 'sex']] = data[['age', 'sex']].replace('unknown', pd.NA)

    # Change data type
    data['date'] = parse_dates(data['date'])
    # Drop rows whose `date` could not be parsed; copy again so the next
    # column assignment does not write into a derived slice.
    data = data.dropna(subset=['date']).copy()
    data['date_submitted'] = parse_dates(data['date_submitted'])

    data['age'] = pd.to_numeric(data['age'], errors='coerce')

    return data

import os

# Define the directory containing the TSV files
dir_path = 'raw_data'

# Initialize an empty list to hold the cleaned DataFrames, one per file
dataframes = []

# Loop through the directory in sorted order: os.listdir returns entries in
# arbitrary order, and a stable order keeps the row order of the combined
# frame reproducible between runs.
for file_name in sorted(os.listdir(dir_path)):
    # Skip anything that is not a TSV file
    if not file_name.endswith('.tsv'):
        continue
    file_path = os.path.join(dir_path, file_name)
    # Reading and cleaning both sit inside the try, so a file that cannot be
    # read or cleaned is reported and skipped instead of aborting the load.
    try:
        df = pd.read_csv(file_path, sep='\t')
        dataframes.append(covid_tests_data_cleaning(df))
    except Exception as e:
        print(f"Error in loading {file_name}\n{e}")

# pd.concat raises a bare "No objects to concatenate" ValueError on an empty
# list; raise the same exception type with a message that names the directory.
if not dataframes:
    raise ValueError(f"No TSV files could be loaded from {dir_path!r}")

data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the filtered dataframe
print(data.head())

                                  strain            virus segment  length  \
0         hCoV-19/Australia/ACT0922/2021  betacoronavirus  genome   29752   
1         hCoV-19/Australia/ACT0896/2021  betacoronavirus  genome   29752   
2  hCoV-19/Australia/NSW-ICPMR-8097/2021  betacoronavirus  genome   29378   
3         hCoV-19/Australia/ACT0793/2021  betacoronavirus  genome   29752   
4         hCoV-19/Australia/ACT0807/2021  betacoronavirus  genome   29752   

    gisaid_epi_isl       date                      division location  \
0  EPI_ISL_4636733 2021-09-28  Australian Capital Territory      NaN   
1  EPI_ISL_4636712 2021-09-27  Australian Capital Territory      NaN   
2  EPI_ISL_4552404 2021-09-13               New South Wales   Sydney   
3  EPI_ISL_4636623 2021-09-23  Australian Capital Territory      NaN   
4  EPI_ISL_4636635 2021-09-23  Australian Capital Territory      NaN   

  region_exposure country_exposure             division_exposure  age   sex  \
0         Oceania        

In [88]:
# Show the (rows, columns) dimensions of `data`
print(data.shape)

(233848, 16)


In [87]:
# Count the missing (NaN/NA) values in each column of `data`
na_counts = data.isna().sum()

print(na_counts)

strain                    0
virus                     0
segment                   0
length                    0
gisaid_epi_isl            0
date                      0
division                  7
location             167273
region_exposure           0
country_exposure          0
division_exposure         7
age                  155677
sex                  143008
originating_lab         128
submitting_lab            0
date_submitted            0
dtype: int64


In [81]:

# 'gisaid_epi_isl' qualifies as a unique key only when the number of distinct
# values equals the number of rows. nunique() ignores missing values, so a
# missing identifier also makes the check fail.
is_unique_key = len(data) == data['gisaid_epi_isl'].nunique()

print(
    "'gisaid_epi_isl' is a unique key."
    if is_unique_key
    else "'gisaid_epi_isl' is not a unique key."
)

'gisaid_epi_isl' is a unique key.


In [82]:

# Print the column names of the dataframe, one per line
print(*data.columns, sep='\n')


strain
virus
segment
length
gisaid_epi_isl
date
division
location
region_exposure
country_exposure
division_exposure
age
sex
originating_lab
submitting_lab
date_submitted
