In [4]:
import json

def filter_downloaded_videos(json_file_path, downloaded_video_ids_file, output_file_path):
    """
    Write a copy of a gloss/instances JSON file without already-downloaded videos.

    The input JSON is expected to be a list of entries, each with an
    'instances' list whose items carry a 'video_id'. Every instance whose
    'video_id' appears in the downloaded-IDs file is dropped; entries left
    with no instances are dropped entirely.

    Parameters:
    - json_file_path (str): Path to the input JSON file.
    - downloaded_video_ids_file (str): Path to a text file with one downloaded
      file name per line. The video ID is the part of the (stripped) line
      before the first '.'; blank lines are ignored.
    - output_file_path (str): Path to save the filtered JSON file.

    Returns:
    - None. Any exception is caught and reported with a printed message, so
      a failure does not propagate to the caller.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Collect the IDs of videos that are already downloaded.
        downloaded_ids = set()
        with open(downloaded_video_ids_file, 'r', encoding='utf-8') as id_file:
            for line in id_file:
                video_id = line.strip().split('.')[0]
                # A blank line (or a name starting with '.') yields an empty
                # ID, which is not a real video and must not be recorded.
                if video_id:
                    downloaded_ids.add(video_id)

        # Keep only the instances that still need downloading.
        filtered_data = []
        for entry in data:
            remaining = [
                inst for inst in entry['instances']
                if inst['video_id'] not in downloaded_ids
            ]
            if remaining:  # Entries with nothing left are dropped
                # Build a new dict (same key order) rather than mutating the
                # loaded entry in place.
                filtered_data.append({**entry, 'instances': remaining})

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(filtered_data, output_file, indent=4)

        print(f"Filtered JSON saved to {output_file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")



In [5]:
# Input annotations, list of downloaded file names, and destination for the
# filtered annotations. These names are reused by later cells.
json_file_path, downloaded_video_ids_file, output_file_path = (
    "updated.json",
    "fileNames.txt",
    "filtered_data2.json",
)

filter_downloaded_videos(
    json_file_path=json_file_path,
    downloaded_video_ids_file=downloaded_video_ids_file,
    output_file_path=output_file_path,
)

Filtered JSON saved to filtered_data2.json


In [1]:
downloaded_video_ids_file = "fileNames.txt"

# One file name per line; the video ID is whatever precedes the first '.'
# of the stripped line. Collected into a set, so duplicates collapse.
with open(downloaded_video_ids_file, 'r') as names_fh:
    downloaded_ids = {raw.strip().split('.')[0] for raw in names_fh}

In [2]:
# Size of the downloaded-ID collection built in an earlier cell.
len(downloaded_ids)

11729

In [6]:
# Re-read the filtered JSON and count how many instances remain in total.
# The context manager closes the file promptly instead of leaking the handle.
with open(output_file_path) as filtered_file:
    content = json.load(filtered_file)

# Total number of instances across all entries.
count = sum(len(entry['instances']) for entry in content)

count

9354

In [17]:
def load_video_ids(path):
    """Return the set of video IDs listed in the text file at ``path``.

    Each line is stripped of surrounding whitespace and the part before the
    first '.' is taken as the ID (so 'abc.mp4' becomes 'abc'). A blank line
    contributes the empty string, exactly as the original inline loops did.
    """
    with open(path, 'r') as id_file:
        return {line.strip().split('.')[0] for line in id_file}


# IDs present in the current download listing.
newDownload = "fileNames.txt"
downloaded_ids = load_video_ids(newDownload)

# IDs present in the previous download listing.
oldDownload = "oldFiles.txt"
downloaded_ids_old = load_video_ids(oldDownload)


In [18]:
# IDs that appear in the old listing but not in the new one.
downloaded_ids_old - downloaded_ids

{'0UsjUE-TXns'}

In [7]:
# Parse the unfiltered annotations into `data` for the per-gloss statistics.
with open(json_file_path) as source_fh:
    data = json.load(source_fh)

In [None]:
# Rebuild the downloaded-ID set: the ID is the text before the first '.'
# on each stripped line of the listing file.
with open(downloaded_video_ids_file, 'r') as names_fh:
    downloaded_ids = {raw.strip().split('.')[0] for raw in names_fh}

In [15]:
# Glosses in file order, and the total number of instances per gloss.
# len() replaces the hand-rolled counter loop; the unused flag is dropped.
words = [entry['gloss'] for entry in data]
total_count = {entry['gloss']: len(entry['instances']) for entry in data}
total_count

{'book': 40,
 'drink': 35,
 'computer': 30,
 'before': 26,
 'chair': 26,
 'go': 26,
 'clothes': 25,
 'who': 25,
 'candy': 24,
 'cousin': 23,
 'deaf': 23,
 'fine': 22,
 'help': 22,
 'no': 22,
 'thin': 22,
 'walk': 22,
 'year': 22,
 'yes': 22,
 'all': 21,
 'black': 21,
 'cool': 21,
 'finish': 21,
 'hot': 21,
 'like': 21,
 'many': 21,
 'mother': 21,
 'now': 21,
 'orange': 21,
 'table': 21,
 'thanksgiving': 21,
 'what': 21,
 'woman': 21,
 'bed': 20,
 'blue': 20,
 'bowling': 20,
 'can': 20,
 'dog': 20,
 'family': 20,
 'fish': 20,
 'graduate': 20,
 'hat': 20,
 'hearing': 20,
 'kiss': 20,
 'language': 20,
 'later': 20,
 'man': 20,
 'shirt': 20,
 'study': 20,
 'tall': 20,
 'white': 20,
 'wrong': 20,
 'accident': 19,
 'apple': 19,
 'bird': 19,
 'change': 19,
 'color': 19,
 'corn': 19,
 'cow': 19,
 'dance': 19,
 'dark': 19,
 'doctor': 19,
 'eat': 19,
 'enjoy': 19,
 'forget': 19,
 'give': 19,
 'last': 19,
 'meet': 19,
 'pink': 19,
 'pizza': 19,
 'play': 19,
 'school': 19,
 'secretary': 19,
 'shor

In [13]:
# Glosses in file order, and for each gloss the number of its instances
# whose video_id is in downloaded_ids. Summing booleans counts the matches.
words = [entry['gloss'] for entry in data]
d = {
    entry['gloss']: sum(
        inst['video_id'] in downloaded_ids for inst in entry['instances']
    )
    for entry in data
}

In [27]:
# A gloss qualifies when strictly more than half of its instances are
# downloaded. `2 * downloaded > total` is the exact integer form of
# `downloaded / total > 0.5`, and it does not raise ZeroDivisionError for a
# gloss with zero instances (such a gloss simply does not qualify).
qualified_words = [
    gloss for gloss, downloaded in d.items()
    if 2 * downloaded > total_count[gloss]
]

len(qualified_words), qualified_words

(1237,
 ['drink',
  'computer',
  'before',
  'go',
  'who',
  'candy',
  'cousin',
  'deaf',
  'fine',
  'help',
  'no',
  'thin',
  'year',
  'yes',
  'all',
  'black',
  'cool',
  'finish',
  'hot',
  'like',
  'many',
  'mother',
  'now',
  'orange',
  'thanksgiving',
  'woman',
  'bed',
  'blue',
  'bowling',
  'dog',
  'family',
  'fish',
  'graduate',
  'hat',
  'hearing',
  'kiss',
  'language',
  'later',
  'man',
  'shirt',
  'study',
  'tall',
  'white',
  'wrong',
  'accident',
  'apple',
  'bird',
  'change',
  'color',
  'corn',
  'cow',
  'dance',
  'dark',
  'doctor',
  'forget',
  'give',
  'last',
  'meet',
  'pink',
  'pizza',
  'play',
  'school',
  'secretary',
  'short',
  'want',
  'work',
  'africa',
  'basketball',
  'brown',
  'but',
  'cheat',
  'city',
  'cook',
  'decide',
  'full',
  'letter',
  'paint',
  'paper',
  'pull',
  'purple',
  'right',
  'same',
  'son',
  'tell',
  'thursday',
  'visit',
  'wait',
  'water',
  'yellow',
  'backpack',
  'bar',


In [28]:
# Number of downloaded videos summed over the qualified glosses.
total_videos = sum(d[gloss] for gloss in qualified_words)

total_videos

8407