In [24]:
import pandas as pd
df = pd.read_parquet('raw_data/covid_data.parquet')

strain_df = pd.read_csv('raw_data/lineage_label.csv')

df.columns

Index(['id', 'lineage', 'strain', 'date', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'age',
       'sex', 'originating_lab', 'submitting_lab', 'date_submitted'],
      dtype='object')

In [37]:
uni_strain = pd.Series(df['lineage'].unique())
uni_strain

0        AY.39.1.3
1        AY.39.1.2
2          AY.39.1
3             AY.5
4        B.1.617.2
           ...    
1738       AY.43.4
1739        AY.111
1740         AY.92
1741    AY.124.1.1
1742       AY.98.1
Length: 1743, dtype: object

In [43]:
strain_df

Unnamed: 0,lineage,label
0,AY.10,Delta
1,AY.100,Delta
2,AY.101,Delta
3,AY.102,Delta
4,AY.103,Delta
...,...,...
1708,BE.1,Omicron
1709,EG.5,Omicron
1710,HK.3,Omicron
1711,XBB.2,Omicron


In [48]:

# Function to find the matching prefix and label
def find_matching_prefix_and_label(uni_strain_value, lineage_series, label_series):
    for lineage, label in zip(lineage_series, label_series):
        if uni_strain_value.startswith(lineage):
            return lineage, label
    return None, None

# Apply the function to find matches
matches = uni_strain.apply(lambda x: find_matching_prefix_and_label(x, strain_df['lineage'], strain_df['label']))

# Split the matches into two separate columns
matching_prefixes, matching_labels = zip(*matches)

# Create a DataFrame with the matching ones
match_df = pd.DataFrame({
    'lineage': uni_strain[matches.apply(lambda x: x[0] is not None)],
    'label': [label for label in matching_labels if label is not None]
})


# Display the DataFrames
print("Matching DataFrame:\n")
print(match_df)
match_df.to_csv("raw_data/full_lineage_label.csv", index=False)

Matching DataFrame:

         lineage  label
0      AY.39.1.3  Delta
1      AY.39.1.2  Delta
2        AY.39.1  Delta
3           AY.5  Delta
4      B.1.617.2  Delta
...          ...    ...
1738     AY.43.4  Delta
1739      AY.111  Delta
1740       AY.92  Delta
1741  AY.124.1.1  Delta
1742     AY.98.1  Delta

[1320 rows x 2 columns]


In [11]:
# Function to check if any lineage starts with a prefix from uni_strain
def has_matching_prefix(lineage, prefixes):
    return any(lineage.startswith(prefix) for prefix in prefixes)

# Apply the function to check for matching prefixes
matching_prefixes = strain_df['lineage'].apply(lambda x: has_matching_prefix(x, uni_strain))

# Count the number of matching prefixes
num_matching_prefixes = matching_prefixes.sum()

print(f"Number of strings in uni_strain with a matching prefix in strain_df['lineage']: {num_matching_prefixes} out of {len(uni_strain)}")


Number of strings in uni_strain with a matching prefix in strain_df['lineage']: 1512 out of 1743
