In [1]:
import os
import pandas as pd

# Path to the main directory
main_dir = "code-davinci-002/"

# List to store dataframes
dataframes = []

# Iterate over each subdirectory and process files
for sub_dir, _, files in os.walk(main_dir):
    if sub_dir == main_dir:
        continue  # Skip the main directory
    
    # List to store dataframes for the current subdirectory
    sub_dataframes = []
    
    for file in files:
        if file.endswith('.csv'):
            # Read CSV file
            file_path = os.path.join(sub_dir, file)
            df = pd.read_csv(file_path)
            
            # Append the dataframe to the list for the current subdirectory
            sub_dataframes.append(df)
    
    if sub_dataframes:
        # Concatenate dataframes within the current subdirectory
        sub_df = pd.concat(sub_dataframes, ignore_index=True)
        
        # Add a new column 'Name' with the folder name
        folder_name = os.path.basename(sub_dir)
        sub_df['Name'] = folder_name
        
        # Append the subdirectory dataframe to the main list
        dataframes.append(sub_df)

# Print or inspect the list of dataframes
# for df in dataframes:
#     print(f"Dataframe for folder: {df['Name'].iloc[0]}")
#     print(df.head())
#     print()

In [2]:
# Concatenate all dataframes into a single dataframe
final_df_1 = pd.concat(dataframes, ignore_index=True)

# Add a new column 'model' with the value 'code-davinci-002'
final_df_1['model'] = 'code-davinci-002'

In [3]:
final_df_1['Final Answer_0'].value_counts()[:20]

Final Answer_0
no       2348
yes      2173
(A)      1389
(B)      1381
(C)       922
True      550
(D)       428
False     339
2.0       232
2.0       190
4.0       184
5.0       184
7.0       176
1.0       176
3.0       169
8.0       159
(E)       158
4.0       157
8.0       155
10.0      148
Name: count, dtype: int64

In [6]:
final_df_1['Final Answer_0'].value_counts()[-20:] # 

Final Answer_0
9860.78               1
7.875                 1
1680.0                1
736.0                 1
138.91500000000002    1
22.22222222222222     1
1190.0                1
-140.0                1
820.0                 1
-8.4                  1
[4, 6.0]              1
16.666666666666668    1
525.0                 1
11.38888888888889     1
67.5                  1
20.000000000000004    1
324.0                 1
27.333333333333332    1
113.66666666666667    1
either (A) or (D)     1
Name: count, dtype: int64

In [40]:
final_df_1['Final Answer_0'].value_counts().index[8]

'2.0'

In [41]:
final_df_1['Final Answer_0'].value_counts().index[9]

2.0

In [42]:
def clean_answer_columns(df):
    # List of column names to clean
    answer_columns = [f"Final Answer_{i}" for i in range(40)] + ["Correct Answer"]

    # Iterate over each answer column
    for column in answer_columns:
        # Convert the column to string type
        df[column] = df[column].astype(str)

    return df

final_df_1 = clean_answer_columns(final_df_1)
final_df_1['Final Answer_0'].value_counts()[:20]

Final Answer_0
no       2348
yes      2173
(A)      1389
(B)      1381
(C)       922
nan       829
(D)       428
True      425
2.0       422
4.0       341
5.0       332
7.0       323
8.0       314
3.0       302
1.0       301
False     300
6.0       284
10.0      281
12.0      270
9.0       249
Name: count, dtype: int64

In [43]:
main_dir = "vicuna-13b"

# List to store dataframes
dataframes = []

# Iterate over each subdirectory and process files
for sub_dir, _, files in os.walk(main_dir):
    if sub_dir == main_dir:
        continue  # Skip the main directory
    
    # List to store dataframes for the current subdirectory
    sub_dataframes = []
    
    for file in files:
        if file.endswith('.csv'):
            # Read CSV file
            file_path = os.path.join(sub_dir, file)
            try:
                df = pd.read_csv(file_path)
            except:
                print(file)
                break
            
            # Append the dataframe to the list for the current subdirectory
            sub_dataframes.append(df)
    
    if sub_dataframes:
        # Concatenate dataframes within the current subdirectory
        sub_df = pd.concat(sub_dataframes, ignore_index=True)
        
        # Add a new column 'Name' with the folder name
        folder_name = os.path.basename(sub_dir)
        sub_df['Name'] = folder_name
        
        # Append the subdirectory dataframe to the main list
        dataframes.append(sub_df)

salient_translation_seed2_cleaned.csv


In [44]:
len(dataframes)

13

In [45]:
# Concatenate all dataframes into a single dataframe
final_df_2 = pd.concat(dataframes, ignore_index=True)

# Add a new column 'model' with the value 'code-davinci-002'
final_df_2['model'] = 'vicuna-13b'
final_df_2= clean_answer_columns(final_df_2)

In [47]:
final_df_2['Final Answer_0'].value_counts()[:20]

Final Answer_0
no       4106
nan      3584
yes      1748
(A)       983
(B)       821
(C)       725
True      343
2.0       319
4.0       302
7.0       285
5.0       279
8.0       273
(D)       254
6.0       254
3.0       248
0.0       247
12.0      235
False     235
9.0       232
1.0       231
Name: count, dtype: int64

In [48]:
final_df = pd.concat([final_df_1,final_df_2]).reset_index(drop=True)

In [50]:
final_df_test = final_df.sample(1000)

In [70]:
import re
import numpy as np
def extract_len(df):
    step_count_buffer = []
    for col in df:
        if col.startswith('CoT_'):
            cleaned_answers = []
            for entry in df[col]:
                entry_str = str(entry)
                # Count the number of new lines in the entry
                num_of_newlines = entry_str.count('\n')
                
                if num_of_newlines == 0:
                    # If no new lines, count the number of sentences
                    num_of_sentences = entry_str.count('.') + 1
                    cleaned_answers.append(num_of_sentences)
                else:
                    cleaned_answers.append(num_of_newlines)
            
            step_count_buffer.append(cleaned_answers)
    
    step_count = np.array(step_count_buffer).T
    return step_count

extract_len(final_df_test)

array([[12, 13, 12, ..., 12, 12, 12],
       [14,  6, 13, ..., 10, 11, 11],
       [13, 13, 13, ..., 13, 14, 13],
       ...,
       [ 3,  5,  5, ...,  3,  4,  4],
       [ 6,  6,  6, ...,  6,  5,  6],
       [13, 13, 13, ..., 12, 13, 13]])

In [62]:
(extract_len(final_df_test)[:,0]==0).sum()

57

In [68]:
final_df_test[extract_len(final_df_test)[:,0]==0]['CoT_0'].iloc[0]

'A leap year is every fourth year. The average lifespan of a hamster is about 3 years. 3 years is less than 4 years, so a hamster will not experience two leap years. The answer is no'

In [69]:
final_df_test[extract_len(final_df_test)[:,0]==0]['CoT_0'].iloc[1]

'The class of 2017 was born in 1995. 9/11 occurred in 2001. Thus, a student of the class of 2017 would not have amnesia about 9/11. The answer is no'