In [1]:
# import required packages
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import Ridge
# Lookup table: amino-acid token -> integer code. Codes start at 1; 0 is not assigned here
# and is the value format_batch_seqs uses for padding.
# There is no fallback entry: looking up a token that is not a key raises KeyError.
# NOTE(review): seqs_to_onehot, as called in this notebook, only encodes codes 0-23, so
# 'start', 'stop' and '-' (24-26) end up with an all-zero one-hot vector -- confirm intended.
aa_to_int = {
    'M':1,
    'R':2,
    'H':3,
    'K':4,
    'D':5,
    'E':6,
    'S':7,
    'T':8,
    'N':9,
    'Q':10, 
    'C':11,
    'U':12,
    'G':13,
    'P':14,
    'A':15,
    'V':16,
    'I':17,
    'F':18,
    'Y':19,
    'W':20,
    'L':21,
    'O':22, # Pyrrolysine
    # The four ambiguity codes below deliberately share a single code (23)
    'X':23, # Unknown
    'Z':23, # Glutamic acid or glutamine
    'B':23, # Asparagine or aspartic acid
    'J':23, # Leucine or isoleucine
    # Multi-character keys: only reachable when a sequence is given as a list of tokens
    'start':24,
    'stop':25,
    '-':26,
}

def aa_seq_to_int(s):
    """
    Return the integer codes for a sequence of amino-acid tokens as a list.

    Each element of ``s`` is looked up in the module-level ``aa_to_int`` table.
    Iterating a plain string yields single characters, so the multi-character keys
    ('start', 'stop') can only match when ``s`` is a list/tuple of tokens.

    Raises:
        KeyError: if an element of ``s`` is not a key of ``aa_to_int``. The message
            names the offending token and its position in the sequence.
    """
    codes = []
    for idx, token in enumerate(s):
        try:
            codes.append(aa_to_int[token])
        except KeyError:
            # Same exception type as the plain lookup, but say which token and where
            raise KeyError(f"unknown amino-acid token {token!r} at position {idx}") from None
    return codes

def format_seq(seq):
    """
    Encode one amino-acid string as a list of integer codes.

    Leading and trailing whitespace is removed first; the cleaned string is then
    translated by aa_seq_to_int (which uses the aa_to_int table).
    """
    return aa_seq_to_int(seq.strip())

# Encode a batch of sequences as integers and right-pad them with 0 to a common length
def format_batch_seqs(seqs):
    """
    Convert an iterable of amino-acid strings into one 2-D integer array.

    Every sequence is encoded with format_seq (which strips surrounding whitespace),
    then right-padded with 0 up to the length of the longest *encoded* sequence.
    Measuring the length after encoding keeps the rows rectangular even when an
    input string carries leading/trailing whitespace.

    Args:
        seqs: iterable of sequence strings (e.g. a list or a pandas Series).

    Returns:
        Integer ndarray of shape (number of sequences, longest encoded length).

    Raises:
        ValueError: if ``seqs`` contains no sequences.
        KeyError: propagated from the encoder for tokens missing from aa_to_int.
    """
    encoded = [format_seq(seq) for seq in seqs]
    if not encoded:
        raise ValueError("format_batch_seqs needs at least one sequence")
    maxlen = max(len(codes) for codes in encoded)
    # Pre-filled with the padding value 0; each row is overwritten up to its own length
    batch = np.zeros((len(encoded), maxlen), dtype=int)
    for row, codes in enumerate(encoded):
        batch[row, :len(codes)] = codes
    return batch

def seqs_to_onehot(seqs, n_classes=24):
    """
    One-hot encode a 2-D array of integer codes.

    Args:
        seqs: array of shape (n_sequences, seq_len) holding integer codes.
        n_classes: number of codes to encode per position (codes 0 .. n_classes-1).
            Defaults to 24, the value that was previously hard-coded.

    Returns:
        Integer ndarray of shape (n_sequences, seq_len * n_classes). Column
        ``i * n_classes + j`` is 1 where position ``i`` holds code ``j``, else 0.
        Codes outside 0 .. n_classes-1 produce an all-zero block for that position.
    """
    seqs = np.asarray(seqs)
    # Compare every position against every class at once: (n, seq_len, n_classes)
    matches = seqs[:, :, np.newaxis] == np.arange(n_classes)
    # Flatten the last two axes so each position contributes n_classes adjacent columns
    return matches.reshape(seqs.shape[0], seqs.shape[1] * n_classes).astype(int)

In [2]:
# Build the modelling table: kept metadata/score columns, quantile-transformed AM and ESM1b
# scores, and the one-hot encoded variant sequences
def ohe_all(df):
    """
    Assemble the feature table used by the ridge-regression cells.

    The table is built column-wise by name so that every kept column retains its
    dtype (stacking strings and numbers into one numpy array would turn every
    column into object dtype).

    Args:
        df: DataFrame containing the columns "full_var_seq", "pos", "full_var", "wt_aa",
            "var_aa", "am_pathogenicity", "ESM1b_score", "Expr_z_score", "Migr_z_score"
            and "Prolif_z_score".

    Returns:
        DataFrame with a fresh 0..n-1 index and, in this order:
          * the nine kept columns listed above (everything except "full_var_seq");
          * "am_pathogenicity_quant" and "ESM1b_score_quant";
          * one-hot columns "feature_0", "feature_1", ...
        run_ridge_prep slices the one-hot block positionally (everything after the
        first 11 columns), so this column order must be preserved.
    """
    meta_cols = ["pos", "full_var", "wt_aa", "var_aa", "am_pathogenicity", "ESM1b_score",
                 "Expr_z_score", "Migr_z_score", "Prolif_z_score"]

    # Integer-encode and pad the variant sequences, then one-hot encode them
    onehot = seqs_to_onehot(format_batch_seqs(df.loc[:, "full_var_seq"]))

    # Positional index so the three pieces line up row-by-row in the final concat
    df_meta = df[meta_cols].reset_index(drop=True)

    # Map each score onto a uniform [0, 1] quantile scale; each transformer is fitted on all rows of df
    qt_am_pathogenicity = QuantileTransformer(n_quantiles=1000, output_distribution="uniform")
    qt_ESM1b_score = QuantileTransformer(n_quantiles=1000, output_distribution="uniform")
    df_quant = pd.DataFrame({
        "am_pathogenicity_quant": qt_am_pathogenicity.fit_transform(df_meta[["am_pathogenicity"]].values).ravel(),
        "ESM1b_score_quant": qt_ESM1b_score.fit_transform(df_meta[["ESM1b_score"]].values).ravel(),
    })

    df_onehot = pd.DataFrame(onehot, columns=['feature_' + str(i) for i in range(onehot.shape[1])])

    df_final = pd.concat([df_meta, df_quant, df_onehot], axis=1)
    return df_final

# Random sample / remainder split for one group of variants (e.g. all rows of one position)
def split_dataframe_random(df, k, random_state=None):
    """
    Randomly pick ``k`` non-wild-type rows from ``df`` and return them with the rest.

    Args:
        df: DataFrame with "wt_aa" and "var_aa" columns.
        k: number of rows to sample (without replacement).
        random_state: optional seed / RNG forwarded to DataFrame.sample so a split
            can be reproduced. None (the default) keeps the previous unseeded behaviour.

    Returns:
        (df_sample, remaining_df): the ``k`` sampled rows, and ``df`` with the sampled
        index labels dropped. Rows where wt_aa == var_aa are never sampled, so they
        always stay in ``remaining_df``.
        NOTE(review): rows are removed by index label, so this assumes the index of
        ``df`` is unique -- confirm against the caller.

    Raises:
        ValueError: if fewer than ``k`` non-wild-type rows are available.
    """
    # Only substitutions that actually change the residue are eligible for sampling
    df_filtered = df[df["wt_aa"] != df["var_aa"]]
    if k > len(df_filtered):
        raise ValueError(
            f"cannot sample k={k} rows: only {len(df_filtered)} non-wild-type rows available"
        )
    df_sample = df_filtered.sample(n=k, random_state=random_state)
    remaining_df = df.drop(df_sample.index)
    return df_sample, remaining_df

def run_ridge_prep(df_x, k, predictor, n_meta_cols=11):
    """
    Draw a per-position random split and build the design matrices for ridge regression.

    For every distinct value of "pos", split_dataframe_random picks ``k`` rows for
    the training set; all other rows of that position go to the test set.

    Args:
        df_x: feature table (as produced by ohe_all) with a "pos" column, the
            ``predictor`` column, and the one-hot feature columns at the end.
        k: number of rows sampled per position for training.
        predictor: name of the score column used as the first model feature.
        n_meta_cols: number of leading non-feature columns; every column from this
            position onward is treated as a one-hot feature. Defaults to 11, the
            value that was previously hard-coded.

    Returns:
        (df1, df2, train_x, test_x): training rows, test rows, and the matching
        arrays made of the predictor column followed by the one-hot columns.

    Raises:
        ValueError: propagated from pd.concat when ``df_x`` has no groups, or from
            the sampler when a position has fewer than ``k`` eligible rows.
    """
    samples = []
    remainders = []
    # Iterate the groups directly instead of groupby.apply: each group then always
    # contains the "pos" column (pandas 2.2 deprecates passing grouping columns to
    # apply and later versions exclude them, which would shift the positional slice
    # below), and the pieces are concatenated once rather than once per group.
    for _, group in df_x.groupby('pos'):
        df_sample, remaining_df = split_dataframe_random(group, k)
        samples.append(df_sample)
        remainders.append(remaining_df)
    df1 = pd.concat(samples)
    df2 = pd.concat(remainders)
    # One-hot block = everything after the metadata / score columns
    oh1 = df1.iloc[:, n_meta_cols:]
    oh2 = df2.iloc[:, n_meta_cols:]
    train_x = np.column_stack((df1[predictor].values, oh1))
    test_x = np.column_stack((df2[predictor].values, oh2))
    return df1, df2, train_x, test_x

def run_ridge(train_x, train_y, test_x, test_y):
    """
    Fit a ridge regression on the training data and return predictions for test_x.

    Ridge() is created with no arguments, so scikit-learn's default regularisation
    settings apply. test_y is accepted for call-site symmetry but is not used.
    """
    return Ridge().fit(train_x, train_y).predict(test_x)


In [3]:
# Load the variant table (sequences plus ESM1b and AM columns) from CSV
df_var = pd.read_csv(
    "C38_42_All_var_AM.csv",
    delimiter=',',
    header=0,
)
# Keep only rows whose variant residue differs from the wild-type residue (drops synonymous rows).
# reset_index() is called without drop=True, so the old row labels are kept as an "index" column.
df_ms = df_var[df_var["wt_aa"] != df_var["var_aa"]].reset_index()

In [4]:
# One-hot encode every missense variant sequence and attach the quantile-transformed scores.
# df_ohe_all is the feature table that the run_ridge_prep calls in the cells below sample from.
df_ohe_all = ohe_all(df_ms)

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_am_ohllr_exp, df_am_ohllr_mig, df_am_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(1000):
    # Fresh random split: k = 2 training variants per position, AM quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "am_pathogenicity_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")

    # Progress indicator: prints the iteration number once per pass
    print(i)


In [26]:
# Write the three AM result tables currently in memory to CSV (file names tagged "ohllr2").
# df_am_ohllr_* are overwritten by every AM loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_am_ohllr_exp.to_csv("df_amquant_ohllr2_exp.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr2_mig.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr2_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_am_ohllr_exp, df_am_ohllr_mig, df_am_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(1000):
    # Fresh random split: k = 4 training variants per position, AM quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 4, "am_pathogenicity_quant")
    # NOTE(review): column names are offset by 100 here (pred_y_100 .. pred_y_1099), unlike the
    # other loop cells which start at pred_y_0 -- confirm downstream code expects this.
    iteration_name = f"pred_y_{i+100}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [25]:
# Write the three AM result tables currently in memory to CSV (file names tagged "ohllr4").
# df_am_ohllr_* are overwritten by every AM loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_am_ohllr_exp.to_csv("df_amquant_ohllr4_exp.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr4_mig.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr4_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_am_ohllr_exp, df_am_ohllr_mig, df_am_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 8 training variants per position, AM quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 8, "am_pathogenicity_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [9]:
# Write the three AM result tables currently in memory to CSV (file names tagged "ohllr8").
# df_am_ohllr_* are overwritten by every AM loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_am_ohllr_exp.to_csv("df_amquant_ohllr8_exp.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr8_mig.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr8_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_am_ohllr_exp, df_am_ohllr_mig, df_am_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 12 training variants per position, AM quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 12, "am_pathogenicity_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [11]:
# Write the three AM result tables currently in memory to CSV (file names tagged "ohllr12").
# df_am_ohllr_* are overwritten by every AM loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_am_ohllr_exp.to_csv("df_amquant_ohllr12_exp.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr12_mig.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr12_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_am_ohllr_exp, df_am_ohllr_mig, df_am_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 16 training variants per position, AM quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 16, "am_pathogenicity_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [15]:
# Write the three AM result tables currently in memory to CSV (file names tagged "ohllr16").
# df_am_ohllr_* are overwritten by every AM loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): these file names end in "1" (e.g. "_exp1.csv"), unlike most other save cells --
# confirm the suffix is intentional and matches what downstream code reads.
df_am_ohllr_exp.to_csv("df_amquant_ohllr16_exp1.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr16_mig1.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr16_prol1.csv")

In [None]:
# ESM1b
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_esm_ohllr_exp, df_esm_ohllr_mig, df_esm_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 2 training variants per position, ESM1b quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "ESM1b_score_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [7]:
# Write the three ESM1b result tables currently in memory to CSV (file names tagged "ohllr2").
# df_esm_ohllr_* are overwritten by every ESM1b loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr2_exp.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr2_mig.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr2_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_esm_ohllr_exp, df_esm_ohllr_mig, df_esm_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 4 training variants per position, ESM1b quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 4, "ESM1b_score_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [None]:
# Write the three ESM1b result tables currently in memory to CSV (file names tagged "ohllr4").
# df_esm_ohllr_* are overwritten by every ESM1b loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr4_exp.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr4_mig.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr4_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_esm_ohllr_exp, df_esm_ohllr_mig, df_esm_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 8 training variants per position, ESM1b quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 8, "ESM1b_score_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [17]:
# Write the three ESM1b result tables currently in memory to CSV (file names tagged "ohllr8").
# df_esm_ohllr_* are overwritten by every ESM1b loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): these file names end in "1" (e.g. "_exp1.csv"), unlike most other save cells --
# confirm the suffix is intentional and matches what downstream code reads.
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr8_exp1.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr8_mig1.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr8_prol1.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_esm_ohllr_exp, df_esm_ohllr_mig, df_esm_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 12 training variants per position, ESM1b quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 12, "ESM1b_score_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [19]:
# Write the three ESM1b result tables currently in memory to CSV (file names tagged "ohllr12").
# df_esm_ohllr_* are overwritten by every ESM1b loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr12_exp.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr12_mig.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr12_prol.csv")

In [None]:
# Result tables, one per phenotype: start with just the "full_var" key column and
# gain one prediction column per iteration
df_esm_ohllr_exp, df_esm_ohllr_mig, df_esm_ohllr_prol = (
    df_ms[["full_var"]].copy() for _ in range(3)
)

for i in range(100):
    # Fresh random split: k = 16 training variants per position, ESM1b quantile score as predictor
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 16, "ESM1b_score_quant")
    iteration_name = f"pred_y_{i}"

    # Expression: fit on the sampled rows, predict the rest, left-join onto the result table
    # (variants with no prediction in this iteration get NaN in the new column)
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].assign(**{iteration_name: pred_y_exp})
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, how="left", on="full_var")

    # Migration: same split, different target column
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].assign(**{iteration_name: pred_y_mig})
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, how="left", on="full_var")

    # Proliferation: same split, different target column
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].assign(**{iteration_name: pred_y_prol})
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, how="left", on="full_var")


In [21]:
# Write the three ESM1b result tables currently in memory to CSV (file names tagged "ohllr16").
# df_esm_ohllr_* are overwritten by every ESM1b loop cell, so this saves whichever loop ran last.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr16_exp.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr16_mig.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr16_prol.csv")