In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as lite
from sqlite3 import Error
from pathlib import Path
from datetime import date
import numpy as np
import matplotlib.ticker as tick
import requests
import difflib as diff
import re 
import csv
import ast
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
%matplotlib inline

# pd.set_option('mode.chained_assignment', None)

def create_connection(db_file):
    """
    Open a sqlite3 connection to ``db_file`` with a 10 second lock timeout.

    Returns the connection on success. If sqlite3 raises an ``Error`` while
    connecting, the error is printed and ``None`` is returned instead, so
    callers must be prepared for a ``None`` result.
    """
    try:
        return lite.connect(db_file, timeout=10)  # connection via sqlite3
    except Error as exc:
        # Best-effort: report the problem and signal failure with None.
        print(exc)
        return None

conn = create_connection("../CVEfixes.db")
if conn is None:
    # create_connection() only prints the sqlite3 error and returns None.
    # Stop here with a clear message instead of letting read_sql_query fail
    # later with an unrelated-looking error.
    raise RuntimeError(
        "Could not open ../CVEfixes.db; see the sqlite3 error printed above."
    )

# Pull each method's source (code), its before_change flag and the committer
# date of the owning commit. method_change is joined to file_change on
# file_change_id and file_change to commits on hash; only files whose
# programming_language is C or C++ are kept.
query = """
    SELECT m.code, m.before_change, c.committer_date
    FROM file_change f, method_change m, commits c
    WHERE m.file_change_id = f.file_change_id
    AND c.hash = f.hash
    AND f.programming_language IN ('C', 'C++');
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,code,before_change,committer_date
0,"_dl_dst_count (const char *name, int is_path)\...",False,1999-11-10 02:42:49+00:00
1,"_dl_dst_count (const char *name, int is_path)\...",True,1999-11-10 02:42:49+00:00
2,"_dl_dst_substitute (struct link_map *l, const ...",True,1999-11-10 02:42:49+00:00
3,"_dl_dst_substitute (struct link_map *l, const ...",False,1999-11-10 02:42:49+00:00
4,expand_dynamic_string_token (struct link_map *...,True,1999-11-10 02:42:49+00:00
...,...,...,...
20055,"HeifInput::seek_subimage(int subimage, int mip...",False,2024-07-15 09:30:17-04:00
20056,"HeifInput::seek_subimage(int subimage, int mip...",True,2024-07-15 09:30:17-04:00
20057,static bool gguf_ex_read_0(const std::string &...,False,2024-07-20 17:15:42+03:00
20058,struct gguf_context * gguf_init_from_file(cons...,False,2024-07-20 17:15:42+03:00


In [2]:
# Keep only the first row for every distinct `code` string, then renumber the
# rows 0..n-1 so the index has no gaps left by the removed duplicates.
df = df.drop_duplicates(subset='code').reset_index(drop=True)

In [3]:
# Rename the columns for the exported dataset: the method source becomes
# `text` and the before_change flag becomes `label`.
column_aliases = {'code': 'text', 'before_change': 'label'}
df = df.rename(columns=column_aliases)
df

Unnamed: 0,text,label,committer_date
0,"_dl_dst_count (const char *name, int is_path)\...",False,1999-11-10 02:42:49+00:00
1,"_dl_dst_count (const char *name, int is_path)\...",True,1999-11-10 02:42:49+00:00
2,"_dl_dst_substitute (struct link_map *l, const ...",True,1999-11-10 02:42:49+00:00
3,"_dl_dst_substitute (struct link_map *l, const ...",False,1999-11-10 02:42:49+00:00
4,expand_dynamic_string_token (struct link_map *...,True,1999-11-10 02:42:49+00:00
...,...,...,...
19360,"HeifInput::seek_subimage(int subimage, int mip...",False,2024-07-15 09:30:17-04:00
19361,"HeifInput::seek_subimage(int subimage, int mip...",True,2024-07-15 09:30:17-04:00
19362,static bool gguf_ex_read_0(const std::string &...,False,2024-07-20 17:15:42+03:00
19363,struct gguf_context * gguf_init_from_file(cons...,False,2024-07-20 17:15:42+03:00


In [4]:
# Encode the label column numerically: rows whose label equals the string
# 'False' become 0, rows whose label equals the string 'True' become 1.
# Any other value is left untouched.
label_codes = {'False': 0, 'True': 1}
for raw_label, numeric_label in label_codes.items():
    df.loc[df['label'] == raw_label, 'label'] = numeric_label
df

Unnamed: 0,text,label,committer_date
0,"_dl_dst_count (const char *name, int is_path)\...",0,1999-11-10 02:42:49+00:00
1,"_dl_dst_count (const char *name, int is_path)\...",1,1999-11-10 02:42:49+00:00
2,"_dl_dst_substitute (struct link_map *l, const ...",1,1999-11-10 02:42:49+00:00
3,"_dl_dst_substitute (struct link_map *l, const ...",0,1999-11-10 02:42:49+00:00
4,expand_dynamic_string_token (struct link_map *...,1,1999-11-10 02:42:49+00:00
...,...,...,...
19360,"HeifInput::seek_subimage(int subimage, int mip...",0,2024-07-15 09:30:17-04:00
19361,"HeifInput::seek_subimage(int subimage, int mip...",1,2024-07-15 09:30:17-04:00
19362,static bool gguf_ex_read_0(const std::string &...,0,2024-07-20 17:15:42+03:00
19363,struct gguf_context * gguf_init_from_file(cons...,0,2024-07-20 17:15:42+03:00


In [5]:
# Report how many rows carry each label value (1 and 0).
for caption, label_value in (('Vulnerable:', 1), ('Not vulnerable:', 0)):
    matching_rows = int((df['label'] == label_value).sum())
    print(caption, matching_rows)

Vulnerable: 7810
Not vulnerable: 11555


In [11]:
# Prevent time traveling: the oldest 80% of the rows (by commit time) form the
# training set; the newest 20% are split evenly into test and validation.
from sklearn.model_selection import train_test_split

# committer_date holds timestamp strings with differing UTC offsets, so sorting
# the raw strings orders rows by local wall-clock time rather than by the real
# instant. Sort on a UTC-normalised datetime key instead. The stable sort keeps
# the existing relative order of rows that share a timestamp, so the 80% cut is
# deterministic.
df = df.sort_values(
    by='committer_date',
    key=lambda dates: pd.to_datetime(dates, utc=True),
    kind='stable',
)

# The date is only needed for ordering; drop it by name from the exported data.
dfx = df.drop(columns=['committer_date'])

split_index = int(len(df) * 0.8)
train = dfx.iloc[:split_index]
test = dfx.iloc[split_index:]

# Fixed seed so the test/validation files are identical on every re-run.
test, validation = train_test_split(test, test_size=0.5, random_state=42)

train.to_json('c_date_train.json', orient='records')
validation.to_json('c_date_valid.json', orient='records')
test.to_json('c_date_test.json', orient='records')