In [1]:
import pandas as pd


# Name of the evaluation run; it is also the stem of the CSV file read from ./data/.
task = 'company data quality test - 3_8_2024 - tech companies'

evaluate_file = f'./data/{task}.csv'
evaluate_df = pd.read_csv(filepath_or_buffer=evaluate_file)

In [2]:
# Exact-match fields: a cell is True only when the unsuffixed column and its
# "(WF)" counterpart hold the same value in that row.
columns = ['Founding Year', 'Is Public', 'Last Funding Stage']
columns_wf = [col + ' (WF)' for col in columns]
# .to_numpy() strips the "(WF)" labels so the comparison is positional instead
# of label-aligned (label alignment would compare differently named columns).
compare = (evaluate_df[columns] == evaluate_df[columns_wf].to_numpy())


# 'Total Funding' is numeric, so it is scored by relative error against the
# unsuffixed column (the denominator) instead of by exact equality.
FUNDING_TOLERANCE = 0.3  # largest relative error still counted as a match
column = 'Total Funding'
column_wf = column + ' (WF)'
columns.append(column)
# Divide by the magnitude of the baseline: with a signed denominator a negative
# baseline makes the ratio negative, which would pass the threshold no matter
# how far apart the two values are.
diff = abs(evaluate_df[column] - evaluate_df[column_wf]) / abs(evaluate_df[column])
# 0 / 0 evaluates to NaN and NaN <= tolerance is False, so an exact match
# (notably 0 vs 0) is accepted explicitly.
compare[column] = (diff <= FUNDING_TOLERANCE) | (evaluate_df[column] == evaluate_df[column_wf])


def compare_size(row, size_col=None, range_col=None):
    """Return True when a numeric size is consistent with a 'min-max' range string.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record; it is indexed with ``size_col`` and ``range_col``.
    size_col : str, optional
        Key of the numeric size. When omitted, the module-level ``column`` is
        used (the notebook's original calling convention).
    range_col : str, optional
        Key of the range string such as ``'11-50'``. When omitted, the
        module-level ``column_wf`` is used.

    Returns
    -------
    bool
        ``False`` when the range is missing or the size falls outside the
        tolerated window, ``True`` otherwise.

    Notes
    -----
    Tolerated window per range:

    * ``'11-50'``: 0.5 * min .. 1.5 * max
    * ``'1-10'``: no lower limit .. 2 * max
    * anything else: 0.7 * min .. 1.3 * max

    A range that cannot be parsed as two dash-separated integers (for example
    an open-ended bucket) falls back to the bounds 10_000 .. 10_000_000.

    Previously a size that satisfied the wider '11-50' / '1-10' window fell
    through to the generic 0.7 / 1.3 check and was rejected there, so the
    wider windows never had any effect. Exactly one window is applied now.
    """
    # Fall back to the notebook globals so `df.apply(compare_size, axis=1)`
    # keeps working unchanged.
    if size_col is None:
        size_col = column
    if range_col is None:
        range_col = column_wf

    size = row[size_col]
    size_range = row[range_col]

    # A missing range can never confirm the size.
    if pd.isna(size_range):
        return False

    try:
        parts = size_range.split('-')
        min_size = int(parts[0])
        max_size = int(parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Not a 'min-max' string: use the wide fallback bounds.
        min_size = 10_000
        max_size = 10_000_000

    # Pick the single tolerance window that applies to this range.
    if size_range == '11-50':
        lower, upper = 0.5 * min_size, 1.5 * max_size
    elif size_range == '1-10':
        lower, upper = None, 2 * max_size
    else:
        lower, upper = 0.7 * min_size, 1.3 * max_size

    if lower is not None and size < lower:
        return False
    if size > upper:
        return False
    return True

# Score every yearly size column. compare_size picks up `column` and
# `column_wf` from module scope, so both names must be rebound on each pass
# before the row-wise apply runs.
size_years = ('2023', '2022', '2021', '2020', '2015', '2010', '2005', '2000')
for year in size_years:
    column = 'Size in ' + year
    column_wf = f'{column} (WF)'
    columns.append(column)
    compare[column] = evaluate_df.apply(compare_size, axis=1)


# Render the verdicts as the strings 'True' / 'False', then blank every cell
# whose unsuffixed source value is missing so it reads as '' rather than a verdict.
compare = compare.astype(str).mask(evaluate_df[columns].isna(), '')
compare

Unnamed: 0,Founding Year,Is Public,Last Funding Stage,Total Funding,Size in 2023,Size in 2022,Size in 2021,Size in 2020,Size in 2015,Size in 2010,Size in 2005,Size in 2000
0,True,True,,,True,True,True,True,True,True,,
1,True,True,,,True,True,True,True,True,True,,
2,True,True,,,True,True,True,True,True,True,True,
3,True,True,,,True,True,True,True,True,True,,
4,True,True,,,True,True,True,True,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
95,True,True,False,True,True,True,True,,,,,
96,True,True,False,False,True,True,True,,,,,
97,True,True,True,True,True,True,True,,,,,
98,True,True,True,True,True,True,True,,,,,


In [3]:
# Per-column tallies; cells that hold neither 'True' nor 'False' are not counted.
count_true = (compare == 'True').sum()
count_false = (compare == 'False').sum()
total_scored = count_true + count_false

result = pd.DataFrame(
    dict(
        Correct=count_true,
        Wrong=count_false,
        Total=total_scored,
        Accuracy=count_true / total_scored,
    )
)

result

Unnamed: 0,Correct,Wrong,Total,Accuracy
Founding Year,96,4,100,0.96
Is Public,93,7,100,0.93
Last Funding Stage,15,47,62,0.241935
Total Funding,27,34,61,0.442623
Size in 2023,89,11,100,0.89
Size in 2022,90,10,100,0.9
Size in 2021,84,16,100,0.84
Size in 2020,27,6,33,0.818182
Size in 2015,19,4,23,0.826087
Size in 2010,11,4,15,0.733333


In [None]:
# Persist the per-column summary (index = column names) next to the notebook.
result.to_csv(path_or_buf='result.csv', index=True)