In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as lite
from sqlite3 import Error
from pathlib import Path
from datetime import date
import numpy as np
import matplotlib.ticker as tick
import requests
import difflib as diff
import re 
import csv
import ast
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
%matplotlib inline

# pd.set_option('mode.chained_assignment', None)

def create_connection(db_file):
    """
    create a connection to sqlite3 database
    """
    conn = None
    try:
        conn = lite.connect(db_file, timeout=10)  # connection via sqlite3
    except Error as e:
        print(e)
    return conn

conn = create_connection("../CVEfixes.db")

query = """
    SELECT m.code, m.before_change, c.committer_date
    FROM file_change f, method_change m, commits c
    WHERE m.file_change_id = f.file_change_id
    AND c.hash = f.hash
    AND f.programming_language = 'Java';
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,code,before_change,committer_date
0,static __u8 *kye_report_fixup(struct hid_devic...,True,2014-08-21 10:43:28-05:00
1,static __u8 *kye_report_fixup(struct hid_devic...,False,2014-08-21 10:43:28-05:00
2,"void add_interrupt_randomness(int irq, int irq...",False,2020-07-29 10:35:37-07:00
3,"boolean safeEquals(String s1, String s2) {...",False,2010-07-15 14:27:13+00:00
4,public Authentication getAuthentication(Ht...,False,2010-07-15 14:27:13+00:00
...,...,...,...
8414,"private void writeString(Part currentPart,...",False,2024-07-18 10:04:46-10:00
8415,"void testInvalidMethodSuffix(Position p, P...",False,2024-07-18 10:04:46-10:00
8416,public void initialize(ServiceExtensionCon...,True,2024-05-03 07:49:42+02:00
8417,void setUp(ServiceExtensionContext context...,False,2024-05-03 07:49:42+02:00


In [7]:
df = df.drop_duplicates(subset=['code'], ignore_index=True)

In [9]:
df = df.rename(columns={'before_change': 'label', 'code': 'text'})
df

Unnamed: 0,text,label,committer_date
0,static __u8 *kye_report_fixup(struct hid_devic...,True,2014-08-21 10:43:28-05:00
1,static __u8 *kye_report_fixup(struct hid_devic...,False,2014-08-21 10:43:28-05:00
2,"void add_interrupt_randomness(int irq, int irq...",False,2020-07-29 10:35:37-07:00
3,"boolean safeEquals(String s1, String s2) {...",False,2010-07-15 14:27:13+00:00
4,public Authentication getAuthentication(Ht...,False,2010-07-15 14:27:13+00:00
...,...,...,...
7936,"private void writeString(Part currentPart,...",False,2024-07-18 10:04:46-10:00
7937,"void testInvalidMethodSuffix(Position p, P...",False,2024-07-18 10:04:46-10:00
7938,public void initialize(ServiceExtensionCon...,True,2024-05-03 07:49:42+02:00
7939,void setUp(ServiceExtensionContext context...,False,2024-05-03 07:49:42+02:00


In [10]:
df.loc[df.label == 'False', 'label'] = 0
df.loc[df.label == 'True', 'label'] = 1
df

Unnamed: 0,text,label,committer_date
0,static __u8 *kye_report_fixup(struct hid_devic...,1,2014-08-21 10:43:28-05:00
1,static __u8 *kye_report_fixup(struct hid_devic...,0,2014-08-21 10:43:28-05:00
2,"void add_interrupt_randomness(int irq, int irq...",0,2020-07-29 10:35:37-07:00
3,"boolean safeEquals(String s1, String s2) {...",0,2010-07-15 14:27:13+00:00
4,public Authentication getAuthentication(Ht...,0,2010-07-15 14:27:13+00:00
...,...,...,...
7936,"private void writeString(Part currentPart,...",0,2024-07-18 10:04:46-10:00
7937,"void testInvalidMethodSuffix(Position p, P...",0,2024-07-18 10:04:46-10:00
7938,public void initialize(ServiceExtensionCon...,1,2024-05-03 07:49:42+02:00
7939,void setUp(ServiceExtensionContext context...,0,2024-05-03 07:49:42+02:00


In [14]:
print('Vulnerable:', len(df[df['label'] == 1]))
print('Not vulnerable:', len(df[df['label'] == 0]))

Vulnerable: 2856
Not vulnerable: 5085


In [15]:
# Randomly drop non vulnerable code until it reaches a 50/50 split
df_balanced = df.drop(df[df['label'] == 0].sample(frac=.43834808).index)
df_balanced = df_balanced.reset_index()
df_balanced

In [17]:
print('Vulnerable:', len(df_balanced[df_balanced['label'] == 1]))
print('Not vulnerable:', len(df_balanced[df_balanced['label'] == 0]))

Vulnerable: 2856
Not vulnerable: 2856


In [24]:
df.to_json('java_unprocessed_date.json', orient='records')
df_balanced.to_json('java_balanced_date.json', orient='records')

In [27]:
df_balanced_nodate = df_balanced.drop(df_balanced.columns[[0,3]], axis=1)
df_balanced_nodate

Unnamed: 0,text,label
0,static __u8 *kye_report_fixup(struct hid_devic...,1
1,"void add_interrupt_randomness(int irq, int irq...",0
2,"boolean safeEquals(String s1, String s2) {...",0
3,public Authentication getAuthentication(Ht...,0
4,public Authentication getAuthentication(Ht...,1
...,...,...
5707,private static BinaryHttpRequest readReque...,0
5708,private static Stream<Arguments> invalidCh...,0
5709,private static void writeString(ByteBuf ou...,0
5710,public void initialize(ServiceExtensionCon...,1


In [28]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_balanced_nodate, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [29]:
train.to_json('java_train.json', orient='records')
test.to_json('java_test.json', orient='records')

In [30]:
!pwd

/lunarc/nobackup/projects/lu2024-17-13/kevin/java


In [32]:
df = pd.read_json('java_balanced_date.json')
df

Unnamed: 0,index,text,label,committer_date
0,0,static __u8 *kye_report_fixup(struct hid_devic...,1,2014-08-21 10:43:28-05:00
1,2,"void add_interrupt_randomness(int irq, int irq...",0,2020-07-29 10:35:37-07:00
2,3,"boolean safeEquals(String s1, String s2) {...",0,2010-07-15 14:27:13+00:00
3,4,public Authentication getAuthentication(Ht...,0,2010-07-15 14:27:13+00:00
4,5,public Authentication getAuthentication(Ht...,1,2010-07-15 14:27:13+00:00
...,...,...,...,...
5707,7932,private static BinaryHttpRequest readReque...,0,2024-07-18 10:04:46-10:00
5708,7933,private static Stream<Arguments> invalidCh...,0,2024-07-18 10:04:46-10:00
5709,7935,private static void writeString(ByteBuf ou...,0,2024-07-18 10:04:46-10:00
5710,7938,public void initialize(ServiceExtensionCon...,1,2024-05-03 07:49:42+02:00


In [33]:
df['committer_date'] = pd.to_datetime(df['committer_date'])

  df['committer_date'] = pd.to_datetime(df['committer_date'])


In [34]:
df = df.sort_values(by='committer_date')
df

Unnamed: 0,index,text,label,committer_date
2,3,"boolean safeEquals(String s1, String s2) {...",0,2010-07-15 14:27:13+00:00
3,4,public Authentication getAuthentication(Ht...,0,2010-07-15 14:27:13+00:00
4,5,public Authentication getAuthentication(Ht...,1,2010-07-15 14:27:13+00:00
5,7,public SAXParser newSAXParser() throws Par...,1,2010-12-03 16:35:02-08:00
6,8,public SAXParser newSAXParser() {\n ...,0,2010-12-03 16:35:02-08:00
...,...,...,...,...
5651,7856,"private int verify(\n Message m, byte[]...",1,2024-07-21 13:34:12+02:00
5652,7858,private Message maybeAddToCache(Message mess...,1,2024-07-21 13:34:12+02:00
5653,7860,static void main(String[] args) throws Excep...,1,2024-07-21 13:34:12+02:00
5649,7853,NioUdpClient() {\n // https://datatracker...,0,2024-07-21 13:34:12+02:00


In [35]:

split_index = int(len(df) * 0.8)
train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]

In [42]:
print(len(train_df[train_df['label'] == 0]))
print(len(train_df[train_df['label'] == 1]))
print(len(test_df[test_df['label'] == 0]))
print(len(test_df[test_df['label'] == 1]))

2278
2291
578
565


Unnamed: 0,index,text,label,committer_date
2791,3943,public void beforeEach()\n {\n w...,0,2023-02-28 11:48:06+01:00
2790,3941,public void before(Class<?> testClass) thr...,1,2023-02-28 11:48:06+01:00
2802,3959,void testEq1ConfigClassIsNew() throws Exce...,1,2023-02-28 17:13:51+01:00
2803,3960,void escapeInfoMessageInternalDocumentPara...,0,2023-02-28 17:13:51+01:00
2801,3958,void testEq1ConfigClassIsNew() throws Exce...,0,2023-02-28 17:13:51+01:00
...,...,...,...,...
5651,7856,"private int verify(\n Message m, byte[]...",1,2024-07-21 13:34:12+02:00
5652,7858,private Message maybeAddToCache(Message mess...,1,2024-07-21 13:34:12+02:00
5653,7860,static void main(String[] args) throws Excep...,1,2024-07-21 13:34:12+02:00
5649,7853,NioUdpClient() {\n // https://datatracker...,0,2024-07-21 13:34:12+02:00


In [43]:
train_df.to_json('java_notimetravelling_train.json', orient='records')
test_df.to_json('java_notimetravelling_test.json', orient='records')