In [1]:
# now convert into the ohe from chloe hsu code
import numpy as np
# stuff to perform chloe hsu augment
# note does not have an error handling component for if KeyError
aa_to_int = {
    'M':1,
    'R':2,
    'H':3,
    'K':4,
    'D':5,
    'E':6,
    'S':7,
    'T':8,
    'N':9,
    'Q':10, 'C':11,
    'U':12,
    'G':13,
    'P':14,
    'A':15,
    'V':16,
    'I':17,
    'F':18,
    'Y':19,
    'W':20,
    'L':21,
    'O':22, #Pyrrolysine
    'X':23, # Unknown
    'Z':23, # Glutamic acid or GLutamine
    'B':23, # Asparagine or aspartic acid
    'J':23, # Leucine or isoleucine
    'start':24,
    'stop':25,
    '-':26,
}

def aa_seq_to_int(s):
    """
    Return the int sequence as a list for a given string of amino acids
    """
    return [aa_to_int[a] for a in s]

def format_seq(seq):
    """
    Takes an amino acid sequence, returns a list of integers based on dictionary aa_to_int
    """
    int_seq = aa_seq_to_int(seq.strip())
    return int_seq

# converts sequences to integers, insures all are same length
def format_batch_seqs(seqs):
    maxlen = -1
    for s in seqs:
        if len(s) > maxlen:
            maxlen = len(s)
    formatted = []
    for seq in seqs:
        pad_len = maxlen - len(seq)
        padded = np.pad(format_seq(seq), (0, pad_len), 'constant', constant_values=0)
        formatted.append(padded)
    return np.stack(formatted)

def seqs_to_onehot(seqs):
    X = np.zeros((seqs.shape[0], seqs.shape[1]*24), dtype=int)
    for i in range(seqs.shape[1]):
        for j in range(24):
            X[:, i*24+j] = (seqs[:, i] == j)
    return X

In [2]:
# prep OHE with position, full_var, AM_pathogenicity, and ESM1_b score 
from sklearn.preprocessing import QuantileTransformer
def ohe_all(df):
    # pull variant sequences and one hot encode
    ls_df = df.loc[:,"full_var_seq"]
    df_temp1 = format_batch_seqs(ls_df)
    df_temp2 = seqs_to_onehot(df_temp1)
    # pull columns that I want to keep from df_ms1
    df_ohe = np.column_stack((df["pos"].values, df["full_var"].values, df["wt_aa"].values, df["var_aa"].values, df["am_pathogenicity"].values, df["ESM1b_score"].values,
                             df["Expr_z_score"].values, df["Migr_z_score"].values, df["Prolif_z_score"].values))
    # transform AM_path and ESM1b to percentile based scores
    qt_am_pathogenicity = QuantileTransformer(n_quantiles=1000, output_distribution="uniform")
    qt_ESM1b_score = QuantileTransformer(n_quantiles=1000, output_distribution="uniform")
    am_pathogenicity_quant = qt_am_pathogenicity.fit_transform(df_ohe[:, 4].reshape(-1, 1))  # Assuming "am_pathogenicity" is the 5th column
    ESM1b_score_quant = qt_ESM1b_score.fit_transform(df_ohe[:, 5].reshape(-1, 1))  # Assuming "ESM1b_score" is the 6th column
    # combine everything together and return as df
    df_ohe = np.column_stack((df_ohe, am_pathogenicity_quant, ESM1b_score_quant, df_temp2))  
    column_names = ["pos", "full_var", "wt_aa", "var_aa", "am_pathogenicity", "ESM1b_score", "Expr_z_score", "Migr_z_score", "Prolif_z_score",
                    "am_pathogenicity_quant", "ESM1b_score_quant"] + ['feature_' + str(i) for i in range(df_temp2.shape[1])]
    df_final = pd.DataFrame(df_ohe, columns=column_names)
    return df_final

# random sampling within each position 
def split_dataframe_random(df, k):
    # make sure it's not the WT sub
    df_filtered = df[df["wt_aa"] != df["var_aa"]]
    df_sample = df_filtered.sample(n=k)
    remaining_df = df.drop(df_sample.index)
    return df_sample, remaining_df

def run_ridge_prep(df_x, k, predictor):
    #Group the dataframe by pos and use split_data_frame function to randomly sample k rows (df1), df2 remainder 
    grouped = df_x.groupby('pos').apply(lambda x: split_dataframe_random(x, k))
    # Initialize empty dataframes for storing the results
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    for _, (df_sample, remaining_df) in grouped.items():
        df1 = pd.concat([df1, df_sample])
        df2 = pd.concat([df2, remaining_df])
    oh1 = df1.iloc[:, 11:]
    oh2 = df2.iloc[:, 11:]
    train_x = np.column_stack((df1[predictor].values, oh1))
    test_x = np.column_stack((df2[predictor].values,oh2))
    return df1, df2, train_x, test_x

from scipy import stats

from sklearn.linear_model import Ridge
def run_ridge(train_x, train_y, test_x, test_y):
    # train default linear regression
    lm = Ridge() 
    lm.fit(train_x, train_y)
    pred_y = lm.predict(test_x)
    return pred_y


In [3]:
#import necessary sequence, ESM1b, and AM data 
import pandas as pd
df_var = pd.read_csv("C38_42_All_var_AM.csv", delimiter = ',', header = 0)
# subset with only missense variants (no synonymous)
df_ms = df_var[df_var["wt_aa"] != df_var["var_aa"]]
df_ms = df_ms.reset_index()


In [4]:
# Regression with seqs one-hot + am pathogenicity 
# One hot encode all the sequences
df_ohe_all = ohe_all(df_ms)


In [25]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(200):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "am_pathogenicity")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i+800}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [26]:
df_am_ohllr_exp.to_csv("df_am_ohllr_exp9-10.csv")
df_am_ohllr_mig.to_csv("df_am_ohllr_mig9-10.csv")
df_am_ohllr_prol.to_csv("df_am_ohllr_prol9-10.csv")

In [33]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "ESM1b_score")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i+900}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [34]:
df_esm_ohllr_exp.to_csv("df_esm_ohllr_exp10.csv")
df_esm_ohllr_mig.to_csv("df_esm_ohllr_mig10.csv")
df_esm_ohllr_prol.to_csv("df_esm_ohllr_prol10.csv")

In [9]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(500):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "am_pathogenicity_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i+500}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [10]:
df_am_ohllr_exp.to_csv("df_amquant_ohllr_exp6-10.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr_mig6-10.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr_prol6-10.csv")

In [24]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(900):
    # Random sampling + OHE; 4 variants per position rather than 2  
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 4, "am_pathogenicity_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i+100}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [25]:
df_am_ohllr_exp.to_csv("df_amquant_ohllr4_exp2-10.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr4_mig2-10.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr4_prol2-10.csv")

In [8]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE; 8 variants per position rather than 2  
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 8, "am_pathogenicity_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [9]:
df_am_ohllr_exp.to_csv("df_amquant_ohllr8_exp1.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr8_mig1.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr8_prol1.csv")

In [10]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE; 12 variants per position rather than 2
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 12, "am_pathogenicity_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [11]:
df_am_ohllr_exp.to_csv("df_amquant_ohllr12_exp1.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr12_mig1.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr12_prol1.csv")

In [14]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE; 16 variants per position rather than 2
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 16, "am_pathogenicity_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [15]:
df_am_ohllr_exp.to_csv("df_amquant_ohllr16_exp1.csv")
df_am_ohllr_mig.to_csv("df_amquant_ohllr16_mig1.csv")
df_am_ohllr_prol.to_csv("df_amquant_ohllr16_prol1.csv")

In [6]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(900):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 2, "ESM1b_score_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i+100}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [7]:
df_esm_ohllr_exp.to_csv("df_esmquant_ohllr_exp2-10.csv")
df_esm_ohllr_mig.to_csv("df_esmquant_ohllr_mig2-10.csv")
df_esm_ohllr_prol.to_csv("df_esmquant_ohllr_prol2-10.csv")

In [12]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE, 4 variants per position rather than 2 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 4, "ESM1b_score_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [13]:
df_esm_ohllr_exp.to_csv("df_esmquant4_ohllr_exp1.csv")
df_esm_ohllr_mig.to_csv("df_esmquant4_ohllr_mig1.csv")
df_esm_ohllr_prol.to_csv("df_esmquant4_ohllr_prol1.csv")

In [16]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE, 8 variants per position rather than 2 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 8, "ESM1b_score_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [17]:
df_esm_ohllr_exp.to_csv("df_esmquant8_ohllr_exp1.csv")
df_esm_ohllr_mig.to_csv("df_esmquant8_ohllr_mig1.csv")
df_esm_ohllr_prol.to_csv("df_esmquant8_ohllr_prol1.csv")

In [18]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE, 12 variants per position rather than 2 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 12, "ESM1b_score_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [19]:
df_esm_ohllr_exp.to_csv("df_esmquant12_ohllr_exp1.csv")
df_esm_ohllr_mig.to_csv("df_esmquant12_ohllr_mig1.csv")
df_esm_ohllr_prol.to_csv("df_esmquant12_ohllr_prol1.csv")

In [20]:
#Create df to hold the predicted values across iterations
df_esm_ohllr_exp = df_ms[["full_var"]].copy()
df_esm_ohllr_mig = df_ms[["full_var"]].copy()
df_esm_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE, 16 variants per position rather than 2 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_all, 16, "ESM1b_score_quant")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_esm_ohllr_exp = df_esm_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_esm_ohllr_mig = df_esm_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_esm_ohllr_prol = df_esm_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [21]:
df_esm_ohllr_exp.to_csv("df_esmquant16_ohllr_exp1.csv")
df_esm_ohllr_mig.to_csv("df_esmquant16_ohllr_mig1.csv")
df_esm_ohllr_prol.to_csv("df_esmquant16_ohllr_prol1.csv")

In [14]:
#New column with logit transformed am pathogenicity values
df_ms2 = df_ms
df_ms2['logit_am_path'] = np.log(df_ms2['am_pathogenicity'] / (1 - df_ms2['am_pathogenicity']))

def ohe_all_unnormed(df):
    # pull variant sequences and one hot encode
    ls_df = df.loc[:,"full_var_seq"]
    df_temp1 = format_batch_seqs(ls_df)
    df_temp2 = seqs_to_onehot(df_temp1)
    # pull columns that I want to keep from df_ms1
    df_ohe = np.column_stack((df["pos"].values, df["full_var"].values, df["wt_aa"].values, df["var_aa"].values, df["am_pathogenicity"].values, df["ESM1b_score"].values,
                             df["Expr_z_score"].values, df["Migr_z_score"].values, df["Prolif_z_score"].values, df["logit_am_path"]))
   # combine everything together and return as df
    df_ohe = np.column_stack((df_ohe, df_temp2))  
    column_names = ["pos", "full_var", "wt_aa", "var_aa", "am_pathogenicity", "ESM1b_score", "Expr_z_score", "Migr_z_score", "Prolif_z_score", "logit_am_path"] + ['feature_' + str(i) for i in range(df_temp2.shape[1])]
    df_final = pd.DataFrame(df_ohe, columns=column_names)
    return df_final


In [15]:
# Regression with seqs one-hot + am pathogenicity 
# One hot encode all the sequences
df_ohe_logit_all = ohe_all_unnormed(df_ms2)


In [18]:
#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms2[["full_var"]].copy()
df_am_ohllr_mig = df_ms2[["full_var"]].copy()
df_am_ohllr_prol = df_ms2[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep(df_ohe_logit_all, 2, "logit_am_path")
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [20]:
df_am_ohllr_exp.to_csv("df_amlogit_ohllr_exp1.csv")
df_am_ohllr_mig.to_csv("df_amlogit_ohllr_mig1.csv")
df_am_ohllr_prol.to_csv("df_amlogit_ohllr_prol1.csv")

In [22]:
def run_ridge_prep_oheonly (df_x, k):
    #Group the dataframe by pos and use split_data_frame function to randomly sample k rows (df1), df2 remainder 
    grouped = df_x.groupby('pos').apply(lambda x: split_dataframe_random(x, k))
    # Initialize empty dataframes for storing the results
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    for _, (df_sample, remaining_df) in grouped.items():
        df1 = pd.concat([df1, df_sample])
        df2 = pd.concat([df2, remaining_df])
    oh1 = df1.iloc[:, 11:]
    oh2 = df2.iloc[:, 11:]
    ## modified below from standard run_ridge_prep, no longer incorporating AM pathogenicity or the like 
    train_x = oh1
    test_x = oh2
    return df1, df2, train_x, test_x

#Create df to hold the predicted values across iterations
df_am_ohllr_exp = df_ms[["full_var"]].copy()
df_am_ohllr_mig = df_ms[["full_var"]].copy()
df_am_ohllr_prol = df_ms[["full_var"]].copy()

for i in range(100):
    # Random sampling + OHE 
    df1, df2, train_x, test_x = run_ridge_prep_oheonly(df_ohe_all, 2)
    # Linear regression for expression, then migration, then proliferation, adding pred values to dfs  
    iteration_name = f"pred_y_{i}"
    pred_y_exp = run_ridge(train_x, df1["Expr_z_score"].values, test_x, df2["Expr_z_score"].values)
    df_temp_exp = df2[["full_var"]].copy()
    df_temp_exp[iteration_name] = pred_y_exp
    df_am_ohllr_exp = df_am_ohllr_exp.merge(df_temp_exp, on = "full_var", how = "left")
    
    pred_y_mig = run_ridge(train_x, df1["Migr_z_score"].values, test_x, df2["Migr_z_score"].values)
    df_temp_mig = df2[["full_var"]].copy()
    df_temp_mig[iteration_name] = pred_y_mig
    df_am_ohllr_mig = df_am_ohllr_mig.merge(df_temp_mig, on = "full_var", how = "left")
    
    pred_y_prol = run_ridge(train_x, df1["Prolif_z_score"].values, test_x, df2["Prolif_z_score"].values)
    df_temp_prol = df2[["full_var"]].copy()
    df_temp_prol[iteration_name] = pred_y_prol
    df_am_ohllr_prol = df_am_ohllr_prol.merge(df_temp_prol, on = "full_var", how = "left")
    
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [23]:
df_am_ohllr_exp.to_csv("df_am_ohe_exp1.csv")
df_am_ohllr_mig.to_csv("df_am_ohe_mig1.csv")
df_am_ohllr_prol.to_csv("df_am_ohe_prol1.csv")

In [7]:
#New df with the averages from the iterations
#df_ms = df_ms.reset_index()
#df_am_ohllr_all1 = df_ms[["full_var"]].copy()
#df_am_ohllr_all1["AM_expr_pred"] = df_am_ohllr_exp.iloc[:,1:4].mean(axis=1)
#df_am_ohllr_all1["AM_migr_pred"] = df_am_ohllr_mig.iloc[:,1:4].mean(axis=1)
#df_am_ohllr_all1["AM_prolif_pred"] = df_am_ohllr_prol.iloc[:,1:4].mean(axis=1)

In [14]:
df_am_ohllr_exp


Unnamed: 0,full_var,pred_y_0,pred_y_1,pred_y_2,pred_y_3,pred_y_4,pred_y_5,pred_y_6,pred_y_7,pred_y_8,...,pred_y_90,pred_y_91,pred_y_92,pred_y_93,pred_y_94,pred_y_95,pred_y_96,pred_y_97,pred_y_98,pred_y_99
0,Q2K,0.335646,0.635340,0.141072,0.224323,0.275930,0.492186,0.539792,0.278752,0.857630,...,0.731669,1.270759,,0.651267,0.836074,0.892014,,0.556771,0.852460,0.635512
1,Q2R,0.394582,,,0.280856,0.334556,,0.600807,0.338413,0.914320,...,0.790348,1.323214,0.301831,0.707054,0.891776,0.948389,0.226279,0.614881,0.911297,0.691873
2,Q2H,0.199531,0.500259,0.011699,0.093760,0.140531,0.356071,,0.140965,0.726704,...,0.596149,1.149613,0.116032,,0.707429,0.761814,0.039459,0.422565,0.716575,0.505344
3,Q2E,0.420308,0.719359,0.221542,0.305533,0.360147,0.576850,0.627441,0.364455,0.939065,...,,1.346111,0.326336,0.731406,0.916090,0.972997,0.250920,,0.936980,0.716475
4,Q2D,0.198128,0.498867,0.010365,0.092414,0.139135,0.354668,0.397423,0.139545,,...,0.594751,1.148364,0.114695,0.521097,0.706102,0.760472,0.038115,0.421181,0.715174,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6702,F359I,-0.661054,-1.103762,,-0.521856,-0.712657,-0.480838,-0.875063,-0.606438,-0.658648,...,-0.580027,-0.554503,-0.817235,-0.369801,-0.421605,-0.336292,-0.331555,,-0.695124,-0.192389
6703,F359M,-1.261642,-1.699787,-1.557662,-1.097950,-1.310086,-1.081431,-1.496837,-1.214405,-1.236343,...,-1.177994,-1.089044,-1.389335,-0.938298,-0.989234,-0.910780,-0.906800,-1.599542,-1.294700,-0.766738
6704,F359P,-1.597953,-2.033542,-1.877318,-1.420545,-1.644628,-1.417744,-1.845012,-1.554848,-1.559834,...,-1.512837,-1.388370,-1.709693,,-1.307089,-1.232476,,-1.931138,-1.630444,
6705,F359Y,1.092065,,0.679479,1.159765,1.031240,1.272294,0.939898,1.168221,1.027644,...,1.165440,1.005825,0.852726,1.289645,1.235307,1.340641,1.347590,0.721164,1.055040,1.484140


In [16]:
for i in range(10):
    string = f"pred_y_{i+100}"
    print(string)

pred_y_100
pred_y_101
pred_y_102
pred_y_103
pred_y_104
pred_y_105
pred_y_106
pred_y_107
pred_y_108
pred_y_109


In [17]:
df_ohe_logit_all


Unnamed: 0,pos,full_var,wt_aa,var_aa,am_pathogenicity,ESM1b_score,Expr_z_score,Migr_z_score,Prolif_z_score,logit_am_path,...,feature_8606,feature_8607,feature_8608,feature_8609,feature_8610,feature_8611,feature_8612,feature_8613,feature_8614,feature_8615
0,2,Q2K,Q,K,0.0868,-4.919,-1.464268,0.41329,0.299999,-2.353348,...,0,0,0,0,1,0,0,0,0,0
1,2,Q2R,Q,R,0.0742,-3.915,-0.228183,-0.668042,0.709616,-2.523894,...,0,0,0,0,1,0,0,0,0,0
2,2,Q2H,Q,H,0.1159,-4.331,0.430181,-0.829473,-0.09298,-2.031842,...,0,0,0,0,1,0,0,0,0,0
3,2,Q2E,Q,E,0.0687,-1.64,1.027485,-0.728391,-0.677382,-2.606832,...,0,0,0,0,1,0,0,0,0,0
4,2,Q2D,Q,D,0.1162,-2.215,1.271945,-1.463108,-0.864939,-2.028918,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6702,359,F359I,F,I,0.6959,-5.377,0.329759,0.18287,-1.074683,0.827849,...,0,0,0,1,0,0,0,0,0,0
6703,359,F359M,F,M,0.8243,-5.422,0.56399,-1.152813,-1.426938,1.545757,...,0,0,0,0,0,0,0,0,0,0
6704,359,F359P,F,P,0.8962,-6.241,1.228805,-1.285548,-0.505204,2.155698,...,1,0,0,0,0,0,0,0,0,0
6705,359,F359Y,F,Y,0.3211,-7.047,0.970892,0.100578,1.512123,-0.748721,...,0,0,0,0,0,1,0,0,0,0


In [4]:
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from scipy import stats
from sklearn.linear_model import Ridge
import pandas as pd


In [22]:
import sys
print("Python version:", sys.version)
import pkg_resources

# List of package names you want to check versions for
packages = ['numpy', 'pandas', 'scipy', 'scikit']

# Print each package's version
for package in packages:
    version = pkg_resources.get_distribution(package).version
    print(f"{package} version: {version}")
    
print("Scikit-learn version:", sklearn.__version__)

Python version: 3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:38:37) [MSC v.1916 64 bit (AMD64)]
numpy version: 1.24.3
pandas version: 1.5.3
scipy version: 1.10.1


DistributionNotFound: The 'scikit' distribution was not found and is required by the application