**Title**: CVE dataset train test split\
**Description**: Extract CVSS v3 metrics from the CVE JSON data and create an 80/20 train/test split\
**Developer**: Teck Lim\
**Create date**: 04/06/2021

# Import packages

In [None]:
import os
import pandas as pd
import json
from google.colab import drive

# Load data

In [None]:
# Mount Google Drive into the Colab runtime so the data folder below is reachable.
drive.mount('/content/gdrive')
# List the data folder as a quick sanity check that the mount worked.
# NOTE(review): the mount point above is absolute while this path is relative, so
# it only resolves when the working directory is /content — confirm.
!dir './gdrive/Shareddrives/ucsd_drive/Data'

In [None]:
# Location of the raw CVE dump on the mounted drive.
file_path = './gdrive/Shareddrives/ucsd_drive/Data/cve.json'

# Decode explicitly as UTF-8 instead of relying on the runtime's locale default,
# so non-ASCII text in the file is read the same way everywhere.
with open(file_path, 'r', encoding='utf-8') as fp:
    data = json.load(fp)

print('Total CVEs: {}'.format(len(data)))

In [None]:
# Output column -> key read from entry['impact']['baseMetricV3']['cvssV3'].
CVSS_V3_FIELDS = {
    'attack_vector': 'attackVector',
    'attack_complexity': 'attackComplexity',
    'privileges_required': 'privilegesRequired',
    'user_interaction': 'userInteraction',
    'scope': 'scope',
    'confidentiality': 'confidentialityImpact',
    'integrity': 'integrityImpact',
    'availability': 'availabilityImpact',
    'base_score': 'baseScore',
}

# Column order of the resulting DataFrame.
COLUMNS = [
    'cve_id',
    'attack_vector',
    'attack_complexity',
    'privileges_required',
    'user_interaction',
    'scope',
    'confidentiality',
    'integrity',
    'availability',
    'description',
    'last_modified_date',
    'published_date',
    'base_score',
    'exploitability_score',
    'impact_score',
]


def _extract_cvss_v3_row(entry):
    """Flatten one CVE entry into a dict keyed by the names in COLUMNS.

    Returns None when the entry has no 'impact' -> 'baseMetricV3' section, so
    such entries are simply left out of the dataset.

    Raises:
        KeyError: if the entry has a 'baseMetricV3' section but lacks one of
            the fields read below.
    """
    base_metric = (entry.get('impact') or {}).get('baseMetricV3')
    if not base_metric:
        return None

    cvss = base_metric['cvssV3']
    cve = entry['cve']

    row = {column: cvss[key] for column, key in CVSS_V3_FIELDS.items()}
    row['cve_id'] = cve['CVE_data_meta']['ID']
    # An entry may carry several description fragments; join them into one string.
    row['description'] = ' '.join(
        text['value'] for text in cve['description']['description_data'])
    row['last_modified_date'] = entry['lastModifiedDate']
    row['published_date'] = entry['publishedDate']
    # These two scores sit next to 'cvssV3', not inside it.
    row['exploitability_score'] = base_metric['exploitabilityScore']
    row['impact_score'] = base_metric['impactScore']
    return row


rows = []
skipped = []
for idx, entry in enumerate(data):
    # Each row is built completely before it is stored, so a missing key can
    # never leave the columns with different lengths.
    try:
        row = _extract_cvss_v3_row(entry)
    except KeyError:
        # Report the malformed record and keep going rather than dropping
        # every record that follows it.
        print('Key error at index: {}'.format(idx))
        skipped.append(idx)
        continue
    if row is not None:
        rows.append(row)

if skipped:
    print('Skipped {} malformed CVE entries'.format(len(skipped)))

df = pd.DataFrame(rows, columns=COLUMNS)
print('Total CVEs with CVSS base score: {}'.format(len(df)))
# Guard the ratio so an empty input does not raise ZeroDivisionError.
print('Total percentage: {}'.format(len(df) / len(data) if len(data) else 0.0))

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(
    df,
    random_state=88,
    test_size=0.2,
)
train_count, test_count = len(df_train), len(df_test)
print('Train-test 80/20 split: {}, {}'.format(train_count, test_count))

In [None]:
# Preview three rows of the test split; the fixed seed makes the preview repeatable.
df_test.sample(n=3, random_state=88)

# Save the split train test data

In [None]:
import os

# Folder on the mounted drive that receives the train/test CSV files.
output_dir = './gdrive/Shareddrives/ucsd_drive/Data'

# Create the output directory if needed; exist_ok avoids a separate
# check-then-create step.
os.makedirs(output_dir, exist_ok=True)

print('Saving data to %s' % output_dir)

# Save both splits with a header row. No index= argument is passed, so pandas'
# default of also writing the index column applies.
df_train.to_csv(os.path.join(output_dir, 'cve_train.csv'), header=True)
df_test.to_csv(os.path.join(output_dir, 'cve_test.csv'), header=True)