### Aggregate and Parse VirusTotal Data

In [4]:
import os
import json
import pandas as pd
import logging

# Folder containing the JSON files to parse. This is a relative path, so it is
# resolved against the notebook's current working directory.
folder_path = "payloads_json"

In [5]:
# Not used in this cell; kept so that any other cell referencing it still works
json_dicts = []

# Extracted rows, one list per JSON file that was parsed successfully
data = []

# Parse/extraction failures are logged to this file instead of being raised.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, so on a re-used kernel the log destination may differ.
logging.basicConfig(filename='json_parse_errors.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Loop over the files in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue

    # JSON text is UTF-8; pass the encoding explicitly so that decoding does
    # not depend on the platform's default locale encoding.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
        try:
            json_data = json.load(f)

            attributes = json_data['data']['attributes']

            # Per-verdict counts; a missing key raises KeyError and the file is skipped
            last_analysis_stats = attributes['last_analysis_stats']
            harmless = last_analysis_stats['harmless']
            malicious = last_analysis_stats['malicious']
            suspicious = last_analysis_stats['suspicious']
            undetected = last_analysis_stats['undetected']
            timeout = last_analysis_stats['timeout']

            # Per-vendor category labels; .get() yields None when a vendor is absent
            categories = attributes['categories']
            forcepoint = categories.get('Forcepoint ThreatSeeker')
            sophos = categories.get('Sophos')
            bitdefender = categories.get('BitDefender')
            comodo = categories.get("Comodo Valkyrie Verdict")
            alphamnt = categories.get('alphaMountain.ai')
        except json.JSONDecodeError as e:
            # Log the error and skip this file. Without the `continue`, the
            # append below would reuse the previous file's values (or raise
            # NameError if this is the first file).
            logging.error(f"JSONDecodeError: {e} in file {filename}")
            continue
        except Exception as e:
            # Log other unexpected errors (e.g. missing keys) and skip the file
            # for the same reason as above.
            logging.error(f"Error processing file {filename}: {e}")
            continue

    # Reached only when every field above was extracted from *this* file.
    # filename[:-5] strips the trailing ".json".
    data.append([filename[:-5], harmless, malicious, suspicious, undetected, timeout, forcepoint, sophos, bitdefender, comodo, alphamnt])

# Convert the list of data to a Pandas DataFrame
df = pd.DataFrame(data, columns=['filename', 'harmless', 'malicious', 'suspicious', 'undetected', 'timeout', 'forcepoint', 'sophos', 'bitdefender', 'comodo', 'alphamnt'])

In [6]:
# Preview the first rows of the parsed DataFrame
df.head()

Unnamed: 0,filename,harmless,malicious,suspicious,undetected,timeout,forcepoint,sophos,bitdefender,comodo,alphamnt
0,taxfoundation.org,69,0,0,18,0,financial data and services,general business,financial,,
1,simfil.es,67,1,0,20,0,information technology,,misc,,File Sharing/Storage
2,freespincasinogo.com,65,0,0,22,0,,,gambling,media sharing,
3,nascar.com,68,0,0,19,0,sports,sports,sports,unknown,"Hobbies/Recreation, Sports"
4,hugregregy.pro,65,1,0,21,0,,,,media sharing,


In [7]:
# Sum of the 'malicious' column over all rows in df
df['malicious'].sum()

7555

In [8]:
# (number of rows, number of columns) of df
df.shape

(63662, 11)

In [9]:
# Write df to a CSV file in the current working directory; index=False omits
# the row index column from the output.
df.to_csv("yg_virustotal_dat.csv", index = False)