In [1]:
import pandas as pd
import pickle

In [2]:
# Let's first take stock of the situation
print(f'No indirect: There are {445 + 3928} after and {430 + 3868} before questions')
print(f'Any indirect: There are {11283 + 10309} after and {2488 + 19103} before questions')
print(f'Only indir obj: There are {11339 + 6640} after and {2174 + 15830} before questions')
print(f'The other two selections have far too small sample sizes')

print()

print(f'No indirect: Model makes {445 + 430} after and {3928 + 3868} before predictions')
print(f'Any indirect: Model makes {11283 + 2488} after and {10309 + 19103} before predictions')
print(f'Only indir obj: Model makes {11339 + 2174} after and {6640 + 15830} before preds')

No indirect: There are 4373 after and 4298 before questions
Any indirect: There are 21592 after and 21591 before questions
Only indir obj: There are 17979 after and 18004 before questions
The other two selections have far too small sample sizes

No indirect: Model makes 875 after and 7796 before predictions
Any indirect: Model makes 13771 after and 29412 before predictions
Only indir obj: Model makes 13513 after and 22470 before preds


In [3]:
# There are two questions to address
# Question 1: Within the training set used by these models, is there a bias that a blind model can exploit?
# Question 2: Do the biases vary between models?

In [4]:
# Question 1: First load the relevant dataframes

# PSAC
psac_tr = pd.read_csv('/vision/u/momergul/PSAC_2/code_file/data/dataset_balanced/Train_frameqa_question-balanced.csv')

# HME
hme_tr = pd.read_csv('/vision/u/momergul/HME-VideoQA/gif-qa/data/dataset_balanced/Train_frameqa_question-balanced.csv')
split = hme_tr.shape[0] - int(0.1 * hme_tr.shape[0])
hme_tr = hme_tr.loc[:split - 1, :]

# HCRN
hcrn_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs/Train_frameqa_question-balanced.csv')
split = int(0.9 * hcrn_tr.shape[0])
while (hcrn_tr.loc[split - 1, 'vid_id'] == hcrn_tr.loc[split, 'vid_id']):
    split = split + 1
hcrn_tr = hcrn_tr.loc[:split-1, :]

In [6]:
# Question 1: Check answer distribution per question for each
from collections import Counter

def check_biases(df, model_name):
    df_binaries = df[(df['answer'] == 'before') | (df['answer'] == 'after')]
    q_to_c = {}
        
    # Get mapping from question to number of before/after answers
    for row in df_binaries.iterrows():
        question, answer = row[1]['question'], row[1]['answer']
        
        if question not in q_to_c:
            q_to_c[question] = Counter()
        q_to_c[question][answer] += 1
        
    only_before = 0
    only_after = 0
    more_before = 0
    more_after = 0
    equal = 0
    avg_before_diff = 0
    avg_after_diff = 0
    
    for question, a_dict in q_to_c.items():
        if len(a_dict) == 1 and 'before' in a_dict:
            only_before += 1
            more_before += 1
        elif len(a_dict) == 1 and 'after' in a_dict:
            only_after += 1
            more_after += 1
        elif a_dict['before'] == a_dict['after']:
            equal += 1
        elif a_dict['before'] > a_dict['after']:
            more_before += 1
            avg_before_diff += a_dict['before'] - a_dict['after']
        else:
            more_after += 1
            avg_after_diff += a_dict['after'] - a_dict['before']
            
    print(f'For {model_name} data')
    print(f'Equal: {equal}')
    print(f'More Before: {more_before}')
    print(f'Avg Before Delta: {avg_before_diff / (more_before - only_before)}')
    print(f'More After: {more_after}')
    print(f'Avg After Delta: {avg_after_diff / (more_after - only_after)}')
    print(f'Only Before: {only_before}')
    print(f'Only After: {only_after}')
    print()
    
        
check_biases(psac_tr, 'psac')
check_biases(hme_tr, 'hme')
check_biases(hcrn_tr, 'hcrn')

For psac data
Equal: 5817
More Before: 34756
Avg Before Delta: 1.9705656513949321
More After: 34429
Avg After Delta: 1.9655526992287917
Only Before: 30849
Only After: 30539

For hme data
Equal: 5817
More Before: 34756
Avg Before Delta: 1.9705656513949321
More After: 34429
Avg After Delta: 1.9655526992287917
Only Before: 30849
Only After: 30539

For hcrn data
Equal: 5817
More Before: 34756
Avg Before Delta: 1.9705656513949321
More After: 34429
Avg After Delta: 1.9655526992287917
Only Before: 30849
Only After: 30539



In [7]:
# Question 1: What about question length?
lengths = {'before' : 0, 'after' : 0}
answer_counts = Counter()
psac_binaries = psac_tr[(psac_tr['answer'] == 'before') | (psac_tr['answer'] == 'after')]

for row in psac_binaries.iterrows():
    question, answer = row[1]['question'], row[1]['answer']
    sentence = question.lower()
    sentence = sentence.replace(',', '').replace('?', '').replace('\'s', ' \'s')
    words = sentence.split()    
    lengths[answer] += len(words)
    answer_counts[answer] += 1
    
print(f'Average before question length: {lengths["before"] / answer_counts["before"]}')
print(f'Average after question length: {lengths["after"] / answer_counts["after"]}')    

Average before question length: 18.14688764331895
Average after question length: 18.07037643207856


In [8]:
# Question 2: Do biases vary between models? Load pred jsons
import json

# PSAC
with open('/vision/u/momergul/PSAC_2/code_file/data_blind/prediction/FrameQA_prediction.json', 'r') as f:
    psac_preds = json.load(f)
    
# HCRN
with open('/vision/u/momergul/hcrn-videoqa_2/results/expTGIF-QAFrameQA_blind/preds/test_preds.json', 'r') as f:
    hcrn_preds = json.load(f)
    
# HME 
with open('/vision/u/momergul/HME-VideoQA/gif-qa/data/prediction_balanced_blind/FrameQA_test_results-0-csv1.json', 'r') as f:
    hme_preds = json.load(f)
with open('/vision/u/momergul/HME-VideoQA/gif-qa/data/prediction_balanced_blind/FrameQA_test_results-0-csv2.json', 'r') as f:
    hme_preds += json.load(f)

In [9]:
def get_answer_counts(preds, model_name):
    answer_counts = Counter()
    
    for pred in preds:
        answer = pred['prediction']
        if answer in ['before', 'after']:
            answer_counts[answer] += 1
            
    print(f'For {model_name}: {answer_counts}')
    
get_answer_counts(psac_preds, 'psac')
get_answer_counts(hme_preds, 'hme')
get_answer_counts(hcrn_preds, 'hcrn')

For psac: Counter({'after': 24104, 'before': 20742})
For hme: Counter({'before': 25219, 'after': 19617})
For hcrn: Counter({'before': 30427, 'after': 14410})


In [10]:
# Minor point: Do we represent all videos in the training set?
df_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa/csvs_old/Train_frameqa_question-balanced.csv')

In [11]:
vid_ids = set(vid_id[:5] for vid_id in list(df_tr['vid_id'].values))
len(vid_ids)

7787

In [12]:
import os
train_vid_ids = os.listdir('/vision/u/momergul/AGQA/exports/dataset/balanced/train')
for vid_id in train_vid_ids:
    if vid_id[:5] not in vid_ids:
        print(vid_id)

In [13]:
# Part 3: Figuring out the answer distributions on the test set
test_df = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs/Test_frameqa_question-balanced.csv')

In [14]:
def check_biases(df, model_name, report_all=False):
    df_binaries = df[(df['answer'] == 'before') | (df['answer'] == 'after')]
    q_to_c = {}
        
    # Get mapping from question to number of before/after answers
    for row in df_binaries.iterrows():
        question, answer = row[1]['question'], row[1]['answer']
        sentence = question.lower()
        sentence = sentence.replace(',', '').replace('?', '').replace('\'s', ' \'s')
        
        if sentence not in q_to_c:
            q_to_c[sentence] = Counter()
        q_to_c[sentence][answer] += 1
        
    only_before = 0
    only_after = 0
    more_before = 0
    more_after = 0
    equal = 0
    avg_before_diff = 0
    avg_after_diff = 0
    
    question_dict = {'only_before' : [], 'only_after' : [], 'more_before' : [],
                    'more_after' : [], 'equal' : []}
    
    for question, a_dict in q_to_c.items():
        if len(a_dict) == 1 and 'before' in a_dict:
            if report_all:
                only_before += a_dict['before']
                more_before += a_dict['before']
            else:
                only_before += 1
                more_before += 1
            question_dict['only_before'].append(question)
        elif len(a_dict) == 1 and 'after' in a_dict:
            if report_all:
                only_after += a_dict['after']
                more_after += a_dict['after']
            else:
                only_after += 1
                more_after += 1
            question_dict['only_after'].append(question)
        elif a_dict['before'] == a_dict['after']:
            if report_all:
                equal += a_dict['before'] + a_dict['after']
            else:
                equal += 1
            question_dict['equal'].append(question)
        elif a_dict['before'] > a_dict['after']:
            if report_all:
                more_before += a_dict['before'] + a_dict['after']
            else:
                more_before += 1
            avg_before_diff += a_dict['before'] - a_dict['after']
            question_dict['more_before'].append(question)
        else:
            if report_all:
                more_after += a_dict['before'] + a_dict['after']
            else:
                more_after += 1
            avg_after_diff += a_dict['after'] - a_dict['before']            
            question_dict['more_after'].append(question)
            
    print(f'For {model_name} data')
    print(f'Equal: {equal}')
    print(f'More Before: {more_before}')
    print(f'Avg Before Delta: {avg_before_diff / (more_before - only_before)}')
    print(f'More After: {more_after}')
    print(f'Avg After Delta: {avg_after_diff / (more_after - only_after)}')
    print(f'Only Before: {only_before}')
    print(f'Only After: {only_after}')
    print()
    
    return question_dict

question_dict = check_biases(test_df, 'test')

For test data
Equal: 2007
More Before: 15274
Avg Before Delta: 1.4421669106881405
More After: 15469
Avg After Delta: 1.436734693877551
Only Before: 14591
Only After: 14734



In [15]:
# Isolate before/after questions in preds
def isolate_before_after(preds):
    new_preds = []
    for pred in preds:
        if pred['answer'] in ['before', 'after']:
            new_preds.append(pred)
    return new_preds

psac_subpreds = isolate_before_after(psac_preds)
hme_subpreds = isolate_before_after(hme_preds)
hcrn_subpreds = isolate_before_after(hcrn_preds)

In [16]:
len(hcrn_subpreds)

44836

In [17]:
# Report success
def extract_question(question, model_name):
    if model_name == 'psac':
        sentence = question.lower()
        sentence = sentence.replace(',', '').replace('?', '').replace('\'s', ' \'s')
        return sentence
    elif model_name == 'hcrn':
        return ' '.join(question)
    else:
        new_question = []
        for token in question:
            if token == '\\?':
                break
            new_question.append(token)
        return ' '.join(new_question)

def report_subset_accuracy(question_dict, preds, model_name):
    accuracies = {'only_before' : {'correct' : 0, 'total' : 0},
                 'only_after' : {'correct' : 0, 'total' : 0},
                 'equal' : {'correct' : 0, 'total' : 0},
                 'more_before' : {'correct' : 0, 'total' : 0},
                 'more_after' : {'correct' : 0, 'total' : 0}}
    
    for pred in preds:
        question = extract_question(pred['question'], model_name)
        correct = 1 if pred['answer'] == pred['prediction'] else 0
        
        if question in question_dict['only_before']:
            accuracies['only_before']['total'] += 1
            accuracies['only_before']['correct'] += correct
        elif question in question_dict['only_after']:
            accuracies['only_after']['total'] += 1
            accuracies['only_after']['correct'] += correct
        elif question in question_dict['equal']:
            accuracies['equal']['total'] += 1
            accuracies['equal']['correct'] += correct
        elif question in question_dict['more_before']:
            accuracies['more_before']['total'] += 1
            accuracies['more_before']['correct'] += correct
        elif question in question_dict['more_after']:
            accuracies['more_after']['total'] += 1
            accuracies['more_after']['correct'] += correct
        else:
            print("This shouldn't be happening")
            
    print(f'{model_name} results')
    print(f'Only before acc: {accuracies["only_before"]["correct"] / accuracies["only_before"]["total"]}')
    print(f'Only after acc: {accuracies["only_after"]["correct"] / accuracies["only_after"]["total"]}')
    print(f'Equal acc: {accuracies["equal"]["correct"] / accuracies["equal"]["total"]}')
    print(f'Equal counts: {accuracies["equal"]["correct"]}, {accuracies["equal"]["total"]}')
    print(f'More before acc: {accuracies["more_before"]["correct"] / accuracies["more_before"]["total"]}')
    print(f'More after acc: {accuracies["more_after"]["correct"] / accuracies["more_after"]["total"]}')
    
report_subset_accuracy(question_dict, psac_subpreds, 'psac')
report_subset_accuracy(question_dict, hcrn_subpreds, 'hcrn')
report_subset_accuracy(question_dict, hme_subpreds, 'hme')

psac results
Only before acc: 0.7231867504283267
Only after acc: 0.7770514364355754
Equal acc: 0.49910714285714286
Equal counts: 2236, 4480
More before acc: 0.5109237255653507
More after acc: 0.5636168691922802
hcrn results
Only before acc: 0.8974871501998858
Only after acc: 0.6177533115430931
Equal acc: 0.5
Equal counts: 2240, 4480
More before acc: 0.6515906477577615
More after acc: 0.42280200142959257
This shouldn't be happening
This shouldn't be happening
This shouldn't be happening
hme results
Only before acc: 0.811411274201839
Only after acc: 0.6973676664563858
Equal acc: 0.5
Equal counts: 2240, 4480
More before acc: 0.556151782292066
More after acc: 0.5114367405289493


In [18]:
temp = check_biases(test_df, 'test', True)

For test data
Equal: 4480
More Before: 20119
Avg Before Delta: 0.37753928708317364
More After: 20237
Avg After Delta: 0.3774124374553252
Only Before: 17510
Only After: 17439



In [19]:
# Question 4: If a question appears in both train and test, is it in the same category usually?

tr_question_dict = check_biases(psac_tr, 'train')

For train data
Equal: 5817
More Before: 34756
Avg Before Delta: 1.9705656513949321
More After: 34429
Avg After Delta: 1.9655526992287917
Only Before: 30849
Only After: 30539



In [20]:
repetitions = {'only_before' : {'correct' : 0, 'total' : 0},
              'only_after' : {'correct' : 0, 'total' : 0},
              'equal' : {'correct' : 0, 'total' : 0},
              'more_before' : {'correct' : 0, 'total' : 0},
              'more_after' : {'correct' : 0, 'total' : 0}}

for category in ['only_before', 'only_after', 'equal', 'more_before', 'more_after']:
    for question in tr_question_dict[category]:
        appears = False
        for cat in ['only_before', 'only_after', 'equal', 'more_before', 'more_after']:
            if question in question_dict[cat]:
                appears = True
        if not appears:
            continue
        
        repetitions[category]['total'] += 1
        if question in question_dict[category]:
            repetitions[category]['correct'] += 1

print(f'Only before acc: {repetitions["only_before"]["correct"] / repetitions["only_before"]["total"]}')
print(f'Only after acc: {repetitions["only_after"]["correct"] / repetitions["only_after"]["total"]}')
print(f'Equal acc: {repetitions["equal"]["correct"] / repetitions["equal"]["total"]}')
print(f'More before acc: {repetitions["more_before"]["correct"] / repetitions["more_before"]["total"]}')
print(f'More after acc: {repetitions["more_after"]["correct"] / repetitions["more_after"]["total"]}')

Only before acc: 0.7174738120540458
Only after acc: 0.7068219137184655
Equal acc: 0.12430939226519337
More before acc: 0.09465020576131687
More after acc: 0.09561752988047809


In [21]:
repetitions = {'only_before' : {'correct' : 0, 'total' : 0},
              'only_after' : {'correct' : 0, 'total' : 0},
              'equal' : {'correct' : 0, 'total' : 0},
              'more_before' : {'correct' : 0, 'total' : 0},
              'more_after' : {'correct' : 0, 'total' : 0}}

for category in ['only_before', 'only_after', 'equal', 'more_before', 'more_after']:
    for question in question_dict[category]:
        appears = False
        for cat in ['only_before', 'only_after', 'equal', 'more_before', 'more_after']:
            if question in tr_question_dict[cat]:
                appears = True
        if not appears:
            continue
        
        repetitions[category]['total'] += 1
        if question in tr_question_dict[category]:
            repetitions[category]['correct'] += 1

print(f'Only before acc: {repetitions["only_before"]["correct"] / repetitions["only_before"]["total"]}')
print(f'Only after acc: {repetitions["only_after"]["correct"] / repetitions["only_after"]["total"]}')
print(f'Equal acc: {repetitions["equal"]["correct"] / repetitions["equal"]["total"]}')
print(f'More before acc: {repetitions["more_before"]["correct"] / repetitions["more_before"]["total"]}')
print(f'More after acc: {repetitions["more_after"]["correct"] / repetitions["more_after"]["total"]}')

Only before acc: 0.5554119167939828
Only after acc: 0.5607531975367125
Equal acc: 0.16822429906542055
More before acc: 0.3174846625766871
More after acc: 0.3112391930835735


In [25]:
# Part 5: Repeat same analysis with programs
def check_program_biases(mode, report_all=True):
    q_to_c = {}
    
    if mode == 'train':
        ranges = [(0, 1110), (1110, 2220), (2220, 3330), (3330, 4440),
                  (4440, 5550), (5550, 6660), (6660, 7787)]
    else:
        ranges = [(0, 303), (303, 555), (555, 807), (807, 1059),
                  (1059, 1311), (1311, 1563), (1563, 1814)]
    
    for r in ranges:
        with open(f'/vision/u/momergul/AGQA/code/omer_additions/program_stats/{mode}_{r[0]}_{r[1]}.pkl', 'rb') as f:
            curr_qtoc = pickle.load(f)
        
        for program, counts in curr_qtoc.items():
            if program not in q_to_c:
                q_to_c[program] = Counter()
            for answer, count in counts.items():
                q_to_c[program][answer] += count

    only_before = 0
    only_after = 0
    more_before = 0
    more_after = 0
    equal = 0
    avg_before_diff = 0
    avg_after_diff = 0
    
    question_dict = {'only_before' : [], 'only_after' : [], 'more_before' : [],
                    'more_after' : [], 'equal' : []}
    
    for question, a_dict in q_to_c.items():
        if len(a_dict) == 1 and 'before' in a_dict:
            if report_all:
                only_before += a_dict['before']
                more_before += a_dict['before']
            else:
                only_before += 1
                more_before += 1
            question_dict['only_before'].append(question)
        elif len(a_dict) == 1 and 'after' in a_dict:
            if report_all:
                only_after += a_dict['after']
                more_after += a_dict['after']
            else:
                only_after += 1
                more_after += 1
            question_dict['only_after'].append(question)
        elif a_dict['before'] == a_dict['after']:
            if report_all:
                equal += a_dict['before'] + a_dict['after']
            else:
                equal += 1
            question_dict['equal'].append(question)
        elif a_dict['before'] > a_dict['after']:
            if report_all:
                more_before += a_dict['before'] + a_dict['after']
            else:
                more_before += 1
            avg_before_diff += a_dict['before'] - a_dict['after']
            question_dict['more_before'].append(question)
        else:
            if report_all:
                more_after += a_dict['before'] + a_dict['after']
            else:
                more_after += 1
            avg_after_diff += a_dict['after'] - a_dict['before']            
            question_dict['more_after'].append(question)
            
    print(f'For {mode} data')
    print(f'Equal: {equal}')
    print(f'More Before: {more_before}')
    if not report_all:
        print(f'Avg Before Delta: {avg_before_diff / (more_before - only_before)}')
    print(f'More After: {more_after}')
    if not report_all:
        print(f'Avg After Delta: {avg_after_diff / (more_after - only_after)}')
    print(f'Only Before: {only_before}')
    print(f'Only After: {only_after}')
    print()

tr_program_dict = check_program_biases('train', True)
te_program_dict = check_program_biases('test', True)

For train data
Equal: 12356
More Before: 74142
More After: 77650
Only Before: 26564
Only After: 26846

For test data
Equal: 5794
More Before: 19436
More After: 19606
Only Before: 10099
Only After: 10256



In [26]:
with open('/vision/u/momergul/AGQA/code/omer_additions/program_stats/tr_program_dict.pkl', 'wb') as f:
    pickle.dump(tr_program_dict, f)
with open('/vision/u/momergul/AGQA/code/omer_additions/program_stats/te_program_dict.pkl', 'wb') as f:
    pickle.dump(te_program_dict, f)

# Fixing the problem

In [12]:
# First save the entire concatenated dataset, without any removal shenanigans
og_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs_old/Train_frameqa_question-balanced.csv')
added_tr = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/additional-train-balanced-tgif.csv')
tr_df = pd.concat([og_tr, added_tr])
tr_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/agqa_too_point_oh/entire_train_combined.csv')

og_te = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs_old/Test_frameqa_question-balanced.csv')
added_te = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/additional-test-balanced-tgif.csv')
te_df = pd.concat([og_te, added_te])
te_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/agqa_too_point_oh/entire_test_combined.csv')

In [61]:
tr_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/combined/train-balanced_combined-tgif.csv')
tr_df.shape[0]

1718292

In [62]:
# Get the concatenated, sliced and sorted forms of the AGQA 2.0 datasets

# TRAIN
#og_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs_old/Train_frameqa_question-balanced.csv')
#added_tr = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/additional-train-balanced-tgif.csv')
#tr_df = pd.concat([og_tr, added_tr])
tr_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/combined/train-balanced_combined-tgif.csv')
tr_df = tr_df[(tr_df['answer'] == 'before') | (tr_df['answer'] == 'after')]
tr_df = tr_df.sort_values(by=['vid_id'], ascending=True)
tr_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/temp.csv')
tr_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/temp.csv')

# TEST
#og_te = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs_old/Test_frameqa_question-balanced.csv')
#added_te = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/additional-test-balanced-tgif.csv')
#te_df = pd.concat([og_te, added_te])
te_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/combined/test-balanced_combined-tgif.csv')
te_df = te_df[(te_df['answer'] == 'before') | (te_df['answer'] == 'after')]
te_df = te_df.sort_values(by=['vid_id'], ascending=True)

In [63]:
from collections import Counter
answers = list(tr_df['answer'].values)
answer_counts = Counter()
for answer in answers:
    answer_counts[answer] += 1
    
print(answer_counts)

Counter({'before': 109213, 'after': 109213})


In [64]:
# Remove problematic questions from train
import json
import os

# Load the problematic id files
problematic_ids = []
filenames = os.listdir('/vision/u/momergul/qdec_misc/february_qdec_data/more')
for file in filenames:
    with open(f'/vision/u/momergul/qdec_misc/february_qdec_data/more/{file}', 'r') as f:
        problematic_ids += json.load(f)
problematic_ids = set(problematic_ids)
print(len(problematic_ids))

117392


In [65]:
print(tr_df.shape[0])

218426


In [66]:
# Remove from train
tr_df.drop(tr_df[tr_df.key.isin(problematic_ids)].index, inplace=True)
print(tr_df.shape[0])

101063


In [67]:
print(218426 - 101001)

117425


In [68]:
# Save the new train and test sets
tr_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/only_before_after/Train_frameqa_question-balanced.csv')
te_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/only_before_after/Test_frameqa_question-balanced.csv')

total_df = pd.concat([tr_df, te_df])
total_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/only_before_after/Total_frameqa_question-balanced.csv')

In [69]:
tr_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/only_before_after/Train_frameqa_question-balanced.csv')

from collections import Counter
answers = list(tr_df['answer'].values)
answer_counts = Counter()
for answer in answers:
    answer_counts[answer] += 1
    
print(answer_counts)

Counter({'before': 50543, 'after': 50520})


In [70]:
len(list(tr_df['key'].values)) - len(set(list(tr_df['key'].values)))

0

#      repetition repetition repetition repetition repetition repetition repetition 

In [20]:
# Question 1: Is there repetition on the original datasets?
og_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs_old/Train_frameqa_question-balanced.csv')
print(f'There are no repetitions on the original: {len(list(og_tr["key"].values)) == len(set(list(og_tr["key"].values)))}')

added_tr = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/additional-train-balanced-tgif.csv')
print(f'There are no repetitions on the additions: {len(list(added_tr["key"].values)) == len(set(list(added_tr["key"].values)))}')

tr_df = pd.concat([og_tr, added_tr])
tr_df = tr_df[(tr_df['answer'] == 'before') | (tr_df['answer'] == 'after')]
tr_df = tr_df.sort_values(by=['vid_id'], ascending=True)
print(f'There are no repetitions on the combination: {len(list(tr_df["key"].values)) == len(set(list(tr_df["key"].values)))}')

There are no repetitions on the original: True
There are no repetitions on the additions: True
There are no repetitions on the combination: False


In [21]:
# Question 2: Report the number of repetitions in the combo
print(f'Number of repetitions on the combination: {len(list(tr_df["key"].values)) - len(set(list(tr_df["key"].values)))}')

Number of repetitions on the combination: 68


In [22]:
# Question 3: Are there repetitions in the problematic questions?
import json, os

problematic_ids = []
filenames = os.listdir('/vision/u/momergul/qdec_misc/february_qdec_data/only')
for file in filenames:
    with open(f'/vision/u/momergul/qdec_misc/february_qdec_data/only/{file}', 'r') as f:
        problematic_ids += json.load(f)
        
print(f"There are no repetitions in the ids: {len(problematic_ids) == len(set(problematic_ids))}")
print(f'Original length: {len(problematic_ids)}')
print(f'Set length: {len(set(problematic_ids))}')

There are no repetitions in the ids: False
Original length: 76312
Set length: 76310


In [23]:
# Question 4: Are there repetitions in the final dataset?
final_tr = pd.read_csv('/vision/u/momergul/hcrn-videoqa_2/csvs/Train_frameqa_question-balanced.csv')
print(f'There are no repetitions on the original: {len(list(final_tr["key"].values)) == len(set(list(final_tr["key"].values)))}')
print(f'Difference: {len(list(final_tr["key"].values)) - len(set(list(final_tr["key"].values)))}')

There are no repetitions on the original: False
Difference: 53


In [27]:
tr_df.to_csv('/vision/u/momergul/qdec_misc/february_qdec_data/temp.csv')
final_df = pd.read_csv('/vision/u/momergul/qdec_misc/february_qdec_data/temp.csv')
final_df.drop(final_df[final_df.key.isin(problematic_ids)].index, inplace=True)

tr_df.shape[0] - final_df.shape[0]

76323

In [28]:
len(list(final_df['key'].values)) - len(set(list(final_df['key'].values)))

55