In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as lite
from sqlite3 import Error
from pathlib import Path
from datetime import date
import numpy as np
import matplotlib.ticker as tick
import requests
import difflib as diff
import re 
import csv
import ast
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
%matplotlib inline

# pd.set_option('mode.chained_assignment', None)

def create_connection(db_file):
    """
    Open a sqlite3 connection to ``db_file``.

    The connection is opened with a 10 second lock timeout. If sqlite3
    raises an ``Error`` while connecting, the error is printed and ``None``
    is returned instead of a connection, so callers should check the result.
    """
    try:
        return lite.connect(db_file, timeout=10)  # connection via sqlite3
    except Error as exc:
        # Best-effort: report the problem and signal failure with None.
        print(exc)
        return None

# sqlite3.connect() creates a new, empty database when the file does not
# exist, which would only surface later as a confusing "no such table" error.
# Check the path up front so a wrong working directory fails with a clear message.
DB_PATH = Path("../CVEfixes.db")
if not DB_PATH.is_file():
    raise FileNotFoundError(f"CVEfixes database not found at {DB_PATH.resolve()}")

conn = create_connection(str(DB_PATH))
if conn is None:
    # create_connection() prints the sqlite3 error and returns None on failure.
    raise RuntimeError(f"Could not open SQLite database at {DB_PATH}")

# One row per method change in a Python file, together with the date of the
# commit that the change belongs to.
query = """
    SELECT m.code, m.before_change, c.committer_date
    FROM file_change f, method_change m, commits c
    WHERE m.file_change_id = f.file_change_id
    AND c.hash = f.hash
    AND f.programming_language = 'Python';
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,code,before_change,committer_date
0,"def call_with_ns(f, ns, arg=1):\n td = Rtd(...",False,2009-12-02 15:43:39+00:00
1,def taintWrapper(self):\n ...,False,2009-12-02 15:43:39+00:00
2,def test_call_with_request_preserves_taint...,False,2009-12-02 15:43:39+00:00
3,"def boboAwareZopeTraverse(object, path_items, ...",False,2021-05-21 09:11:02+02:00
4,"def traverse(cls, base, request, path_item...",False,2021-05-21 09:11:02+02:00
...,...,...,...
7571,def download_project_pdf():\n project_name ...,True,2024-06-08 17:30:55+05:30
7572,def download_project_pdf():\n project_name ...,False,2024-06-08 17:30:55+05:30
7573,def project_files():\n project_name = secur...,False,2024-06-08 17:30:55+05:30
7574,"def get_project_files(self, project_name: ...",False,2024-06-08 17:30:55+05:30


In [9]:
# Keep a single row per distinct code snippet, then switch to the
# text/label column names used by the saved datasets.
df = (
    df.drop_duplicates(subset=['code'], ignore_index=True)
      .rename(columns={'before_change': 'label', 'code': 'text'})
)

# The label column is compared against the strings 'False' / 'True';
# recode those to 0 / 1 (1 is reported as "Vulnerable" below).
for raw_value, numeric_label in (('False', 0), ('True', 1)):
    df.loc[df['label'] == raw_value, 'label'] = numeric_label

n_vulnerable = len(df[df['label'] == 1])
n_not_vulnerable = len(df[df['label'] == 0])
print('Vulnerable:', n_vulnerable)
print('Not vulnerable:', n_not_vulnerable)

Vulnerable: 2616
Not vulnerable: 4745


In [6]:
# Save dataset without preprocessing ready for training
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the notebook regenerates the same files.
SPLIT_SEED = 42

# Drop the commit date by name rather than by position, so the cell does not
# silently remove the wrong column if the column order ever changes.
df_nopreproc = df.drop(columns=['committer_date'])

# 80/10/10: hold out 20% first, then cut that holdout in half.
train, holdout = train_test_split(df_nopreproc, test_size=0.2, random_state=SPLIT_SEED)
train = train.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)
test, validation = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

train.to_json('python_nopreproc_train.json', orient='records')
validation.to_json('python_nopreproc_valid.json', orient='records')
test.to_json('python_nopreproc_test.json', orient='records')

In [7]:
# Balance dataset

# Fixed seed so that the undersampling and the splits are reproducible.
SPLIT_SEED = 42

# Randomly drop non vulnerable code until it reaches a 50/50 split.
is_not_vulnerable = df['label'] == 0
is_vulnerable = df['label'] == 1
# Clamp at zero: if label 0 is not the majority there is nothing to drop
# (a negative sample size would raise).
to_drop = max(int(is_not_vulnerable.sum()) - int(is_vulnerable.sum()), 0)
# Sample an exact row count instead of a fraction derived from it.
surplus_index = df[is_not_vulnerable].sample(n=to_drop, random_state=SPLIT_SEED).index
df_balanced = df.drop(surplus_index).reset_index(drop=True)
# Drop the commit date by name, using the balanced frame's own column.
df_balanced = df_balanced.drop(columns=['committer_date'])

# Split and save 80/10/10: hold out 20% first, then cut that holdout in half.
train, holdout = train_test_split(df_balanced, test_size=0.2, random_state=SPLIT_SEED)
train = train.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)
test, validation = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

train.to_json('python_balanced_train.json', orient='records')
validation.to_json('python_balanced_valid.json', orient='records')
test.to_json('python_balanced_test.json', orient='records')

In [11]:
# Prevent time traveling

# Fixed seed so that the test/validation cut is reproducible.
SPLIT_SEED = 42

# committer_date holds strings with differing UTC offsets (e.g. +00:00,
# +02:00, +05:30), so sorting the raw strings is not truly chronological.
# Sort on the parsed, UTC-normalised timestamps instead; the column itself is
# left as strings. mergesort is stable, so rows with equal timestamps keep
# their existing relative order.
df = df.sort_values(
    by='committer_date',
    key=lambda dates: pd.to_datetime(dates, utc=True),
    kind='mergesort',
)
dfx = df.drop(columns=['committer_date'])

# Oldest 80% of rows for training; the newest 20% is shared between
# test and validation.
split_index = int(len(df) * 0.8)
train = dfx.iloc[:split_index]
newest = dfx.iloc[split_index:]
test, validation = train_test_split(newest, test_size=0.5, random_state=SPLIT_SEED)
print(len(train), len(test), len(validation))

train.to_json('python_date_train.json', orient='records')
validation.to_json('python_date_valid.json', orient='records')
test.to_json('python_date_test.json', orient='records')

5888 736 737
