In [22]:
#load data
import pandas as pd

# Load the datasets
df_light_train = pd.read_csv('data/train_challenge_light.csv', usecols=['sequence_light'])
df_heavy_train = pd.read_csv('data/train_challenge_heavy.csv', usecols=['sequence_heavy'])
df_light_test = pd.read_csv('data/test_challenge_light.csv', usecols=['sequence_light'])
df_heavy_test = pd.read_csv('data/test_challenge_heavy.csv', usecols=['sequence_heavy'])

In [23]:
print(df_light_train.head())
print(df_heavy_train.head())
print(df_light_test.head())
print(df_light_test.head())

                                      sequence_light
0  TGGGGCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACC...
1  GCTGGGGTCTCAGGAGGCAGCGCTCTCAGGACATCTCCACCATGGC...
2  TGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCC...
3  GGCTGGGGTCTCAGGAGGCAGCACTCTCGGGACGTCTCCACCATGG...
4  TCTGAGGATACGCGTGACAGATAAGAAGGGCTGGTGGGATCAGTCC...
                                      sequence_heavy
0  AGATCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCAC...
1  GGGGAGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGTT...
2  ATCATCCAACAACCACATCCCTTCTCTACAGAAGCCTCTGAGAGGA...
3  GGGATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAG...
4  ACAACCACACCCCTCCTAAGAAGAAGACCCTAGACCACAGCTCCAC...
                                      sequence_light
0  GGGGACTGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTC...
1  TGGGGAGAGCTCTGGGGAGGAACTGCTCAGTTAGGACCCAGACGGA...
2  GGAGGAGTCAGACCCACTCAGGACACAGCATGGACATGAGGGTCCC...
3  GGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCC...
4  CTGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCC...
                                      sequence

In [24]:
# Define a function to create a mapping from characters to integers
def create_char_to_int_map(*sequences):
    unique_chars = set(''.join(''.join(seq) for seq in sequences))
    char_to_int = {char: idx for idx, char in enumerate(sorted(unique_chars))}
    return char_to_int

# Apply encoding to sequences
def encode_sequences(sequences, char_to_int):
    encoded_sequences = [[char_to_int[char] for char in sequence] for sequence in sequences]
    return encoded_sequences

# Create a unified mapping using both training and test data
char_to_int_light = create_char_to_int_map(df_light_train['sequence_light'], df_light_test['sequence_light'])
char_to_int_heavy = create_char_to_int_map(df_heavy_train['sequence_heavy'], df_heavy_test['sequence_heavy'])

# Encode sequences for both training and test datasets
df_light_train['encoded_sequence_light'] = encode_sequences(df_light_train['sequence_light'], char_to_int_light)
df_heavy_train['encoded_sequence_heavy'] = encode_sequences(df_heavy_train['sequence_heavy'], char_to_int_heavy)
df_light_test['encoded_sequence_light'] = encode_sequences(df_light_test['sequence_light'], char_to_int_light)
df_heavy_test['encoded_sequence_heavy'] = encode_sequences(df_heavy_test['sequence_heavy'], char_to_int_heavy)

In [25]:
# Example of how the encoded data looks
print(df_light_train.head())
print(df_heavy_train.head())
print(df_light_test.head())
print(df_heavy_test.head())

                                      sequence_light  \
0  TGGGGCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACC...   
1  GCTGGGGTCTCAGGAGGCAGCGCTCTCAGGACATCTCCACCATGGC...   
2  TGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCC...   
3  GGCTGGGGTCTCAGGAGGCAGCACTCTCGGGACGTCTCCACCATGG...   
4  TCTGAGGATACGCGTGACAGATAAGAAGGGCTGGTGGGATCAGTCC...   

                              encoded_sequence_light  
0  [3, 2, 2, 2, 2, 1, 3, 1, 1, 0, 0, 0, 1, 0, 2, ...  
1  [2, 1, 3, 2, 2, 2, 2, 3, 1, 3, 1, 0, 2, 2, 0, ...  
2  [3, 2, 2, 2, 2, 2, 0, 3, 1, 0, 2, 2, 0, 1, 3, ...  
3  [2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 3, 1, 0, 2, 2, ...  
4  [3, 1, 3, 2, 0, 2, 2, 0, 3, 0, 1, 2, 1, 2, 3, ...  
                                      sequence_heavy  \
0  AGATCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCAC...   
1  GGGGAGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGTT...   
2  ATCATCCAACAACCACATCCCTTCTCTACAGAAGCCTCTGAGAGGA...   
3  GGGATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAG...   
4  ACAACCACACCCCTCCTAAGAAGAAGACCCTAGACCACAGCTCCAC... 

In [29]:
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O
h2o.init()

# Convert list of integers to a string format
df_light_train['encoded_sequence_light'] = df_light_train['encoded_sequence_light'].apply(lambda x: ' '.join(map(str, x)))
df_heavy_train['encoded_sequence_heavy'] = df_heavy_train['encoded_sequence_heavy'].apply(lambda x: ' '.join(map(str, x)))

# Now convert the pandas DataFrame to an H2O Frame
hf_light_train = h2o.H2OFrame(df_light_train)
hf_heavy_train = h2o.H2OFrame(df_heavy_train)

# Combine light and heavy train dataframes
hf_train_combined = hf_light_train.cbind(hf_heavy_train)

# Specify the target column, which is 'encoded_sequence_heavy'
target_column = 'encoded_sequence_heavy'

# Initialize H2O
h2o.init()

# Run AutoML
aml = H2OAutoML(max_runtime_secs=3600, seed=1, project_name="Sequence_Modeling")
aml.train(y=target_column, training_frame=hf_train_combined)

# Print the AutoML Leaderboard
print(aml.leaderboard)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 22 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_maxhager_inpah3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.308 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |███████████████████████████████████████████████████████████████

In [None]:
#oncee

In [27]:
print(df_light_train.columns)
#print(df_heavy_train.columns)
#print(df_light_test.columns)
#print(df_heavy_test.columns)

Index(['sequence_light', 'encoded_sequence_light'], dtype='object')


In [26]:
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O
h2o.init()

# Assuming df_light_train has a column 'target' which are your labels
# Convert pandas DataFrame to H2O Frame
hf_light_train = h2o.H2OFrame(df_light_train)
hf_heavy_train = h2o.H2OFrame(df_heavy_train)

# Combine light and heavy train dataframes if necessary
# Here we assume that each light sequence has a corresponding heavy sequence in the same row
hf_train_combined = hf_light_train.cbind(hf_heavy_train)

# Specify the target column, ensure it exists in hf_train_combined
target_column = 'target_column_name'  # replace 'target_column_name' with the actual name of your target column

# Run AutoML
aml = H2OAutoML(max_runtime_secs=3600, seed=1, project_name="Sequence_Modeling")
aml.train(y=target_column, training_frame=hf_train_combined)

# Print the AutoML Leaderboard
print(aml.leaderboard)

# Optionally, evaluate the model on a test set if you have one
# hf_test_combined = h2o.H2OFrame(df_test)  # Assuming df_test is your test set DataFrame
# perf = aml.leader.model_performance(hf_test_combined)
# print(perf)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 7 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_maxhager_inpah3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.308 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


ValueError: `python_obj` is not a list of flat lists!