In [268]:
import pandas as pd
import numpy as np

In [269]:
meta_file_path = './enterobase_meta.tsv'
allele_file_path = './enterobase_allele.tsv'

In [270]:
def balance_dataset(X, y, upsample=True):
    """
    Balance the dataset by either upsampling or downsampling.

    Parameters:
    - X: Features
    - y: Labels
    - upsample: If True, perform upsampling; if False, perform downsampling

    Returns:
    - X_balanced: Balanced features
    - y_balanced: Balanced labels
    """
    # Combine features and labels
    data = list(zip(X, y))

    # Find the class with the maximum and minimum number of examples
    class_counts = {label: data.count((X_i, label)) for X_i, label in data}
    max_class_count = max(class_counts.values())
    min_class_count = min(class_counts.values())

    # Separate data by class
    class_data = {label: [X_i for X_i, label_i in data if label_i == label] for label in set(y)}

    # Upsample or downsample each class
    balanced_data = []
    for label in class_data:
        if upsample:
            # Upsample by repeating examples
            class_data[label] = resample(class_data[label], n_samples=max_class_count, replace=True)
        else:
            # Downsample by randomly removing examples
            class_data[label] = resample(class_data[label], n_samples=min_class_count, replace=False)

        # Add balanced data to the final list
        balanced_data.extend([(X_i, label) for X_i in class_data[label]])

    # Shuffle the balanced data
    balanced_data = resample(balanced_data, n_samples=len(balanced_data), replace=False)

    # Separate features and labels again
    X_balanced, y_balanced = zip(*balanced_data)

    return X_balanced, y_balanced


In [271]:
meta = pd.read_csv(meta_file_path, sep='\t')
allele = pd.read_csv(allele_file_path, sep='\t')

meta = meta[['Region', 'Serovar', 'ST']]
allele.drop(['Name'], axis=1, inplace=True)
allele.replace({-1:0, '-1':0, '-':0}, inplace=True) # Set instances 'no gene present' to 0

  allele = pd.read_csv(allele_file_path, sep='\t')


In [272]:
province_mapping = {
    'Alberta': 'AB',
    'British Columbia': 'BC',
    'Manitoba': 'MB',
    'New Brunswick': 'NB',
    'Newfoundland and Labrador': 'NL',
    'Nova Scotia': 'NS',
    'Ontario': 'ON',
    'Prince Edward Island': 'PE',
    'Quebec': 'QC',
    'Saskatchewan': 'SK',
    'Northwest Territories': 'NT',
    'Nunavut': 'NU',
    'Yukon': 'YT'
}

meta['Region'] = meta['Region'].replace(province_mapping)


In [273]:
# Left merge meta and allele dataframe on "St" column
merged_df = pd.merge(meta, allele.drop_duplicates(subset='ST'), on='ST', how='left')

print("Merged DataFrame:")
print(merged_df)

Merged DataFrame:
     Region                  Serovar      ST  STMMW_33831 STMMW_14121  \
0        BC  Enteritidis (Predicted)   32875            1           1   
1        NB     Muenchen (Predicted)   32490            1           2   
2        ON              Paratyphi A   32489            1           2   
3        SK                Manhattan   32429            1           2   
4        ON                 Kentucky  106867            1           1   
...     ...                      ...     ...          ...         ...   
1150     ON                        I  388328            1           1   
1151     ON              Typhimurium  388329            1           1   
1152     ON              Typhimurium  388327            1           1   
1153     ON                 Muenchen  388326            1           2   
1154     ON             4,[5],12:I:-  388325            1           1   

      STMMW_00441  STMMW_10921  STMMW_11801  STMMW_36901  STMMW_30761  ...  \
0               1          

In [274]:
merged_df.drop(['Serovar', 'ST'], axis=1, inplace=True)

In [275]:
merged_df.loc[:, merged_df.columns != 'Region'] = merged_df.loc[:, merged_df.columns != 'Region'].astype(int) # Ensure all feature columns are numeric


In [276]:
threshold_percentage = 1

# Calculate the threshold value based on the percentage
threshold_value = threshold_percentage / 100.0 * len(merged_df)

# Identify columns with 5% or more of their values equal to 0
columns_to_remove = merged_df.columns[(merged_df == 0).sum() >= threshold_value]

print(merged_df.shape[1])
# Remove identified columns from the DataFrame
merged_df.drop(columns=columns_to_remove, inplace=True)
print(merged_df.shape[0])
province_counts = merged_df['Region'].value_counts()
print(province_counts)

# List of provinces to be removed
provinces_to_remove = province_counts[province_counts < 40].index.tolist()

print(provinces_to_remove)

# Filter rows based on the values in the 'Region' column
merged_df = merged_df[~merged_df['Region'].isin(provinces_to_remove)]
merged_df.reset_index(drop=True, inplace=True)
print(merged_df)

3003
1155
Region
ON    384
QC    316
BC    218
MB     85
AB     52
SK     48
NS     19
PE     15
NB     12
NL      6
Name: count, dtype: int64
['NS', 'PE', 'NB', 'NL']
     Region  STMMW_33831 STMMW_14121  STMMW_00441  STMMW_10921  STMMW_11801  \
0        BC            1           1            1            1            1   
1        ON            1           2            1           88            1   
2        SK            1           2            1            4            1   
3        ON            1           1            1            4            1   
4        ON            1           1            1            1            1   
...     ...          ...         ...          ...          ...          ...   
1098     ON            1           1            1            1            1   
1099     ON            1           1            1            1            1   
1100     ON            1           1            1            1            1   
1101     ON            1           2      

In [277]:
output_file_path = './enterobase_train_filtered.csv'
merged_df.to_csv(output_file_path, index=False)
