In [2]:
import pandas as pd

# Location of the raw classification export.
raw_csv_path = r'D:\git\dscp\data\classification_raw.csv'

# Load the export into a DataFrame.
df = pd.read_csv(raw_csv_path)

# Preview the first five rows.
df.head()

Unnamed: 0,file_name,category,verification_finished
0,gpt4o_mini-34206.c,VULNERABLE,yes
1,gpt4o_mini-34206.c,VULNERABLE,yes
2,gpt4o_mini-34206.c,VULNERABLE,yes
3,gemini_pro-14924.c,VULNERABLE,yes
4,gemini_pro-14924.c,VULNERABLE,yes


In [3]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799335 entries, 0 to 799334
Data columns (total 3 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   file_name              799335 non-null  object
 1   category               799335 non-null  object
 2   verification_finished  799335 non-null  object
dtypes: object(3)
memory usage: 18.3+ MB


In [4]:
# Count how many distinct category labels each file_name carries.
file_name_category_counts = df.groupby('file_name')['category'].nunique()

# Retain only the file_names labelled with more than one distinct category;
# an empty result means every file has a single category.
file_names_with_multiple_categories = file_name_category_counts.loc[
    file_name_category_counts.gt(1)
]

file_names_with_multiple_categories

Series([], Name: category, dtype: int64)

In [5]:
# Count how many distinct verification_finished values each file_name carries.
file_name_verification_counts = df.groupby('file_name')['verification_finished'].nunique()

# Retain only the file_names with more than one distinct value; an empty
# result means every file has a single verification_finished flag.
file_names_with_multiple_verifications = file_name_verification_counts.loc[
    file_name_verification_counts.gt(1)
]

file_names_with_multiple_verifications

Series([], Name: verification_finished, dtype: int64)

In [6]:
# Tally how many rows carry each verification_finished value
# (value_counts sorts the result by descending frequency).
verification_counts = df['verification_finished'].value_counts()
verification_counts

verification_finished
yes    769637
no      29698
Name: count, dtype: int64

In [7]:
# Tally how many rows carry each category label
# (value_counts sorts the result by descending frequency).
category_counts = df['category'].value_counts()
category_counts

category
VULNERABLE        765366
NON-VULNERABLE     25674
PARSING ERROR       8295
Name: count, dtype: int64

In [8]:
# Collapse fully identical rows, retaining the first occurrence of each.
# The surviving rows keep their original index labels, so the index is no
# longer contiguous after this step.
df = df.drop_duplicates(keep='first')

# Preview the de-duplicated frame.
df.head(n=5)

Unnamed: 0,file_name,category,verification_finished
0,gpt4o_mini-34206.c,VULNERABLE,yes
3,gemini_pro-14924.c,VULNERABLE,yes
9,falcon180b-35726.c,VULNERABLE,yes
12,gemma7b-15674.c,NON-VULNERABLE,yes
13,gemma7b-6476.c,VULNERABLE,yes


In [9]:
# Re-check the structural summary (row count, dtypes, memory usage) now that
# duplicate rows have been removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226682 entries, 0 to 799333
Data columns (total 3 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   file_name              226682 non-null  object
 1   category               226682 non-null  object
 2   verification_finished  226682 non-null  object
dtypes: object(3)
memory usage: 6.9+ MB


In [10]:
# Discard every row whose verification_finished flag is 'no'.
verification_done = df['verification_finished'].ne('no')
df = df.loc[verification_done]
df

Unnamed: 0,file_name,category,verification_finished
0,gpt4o_mini-34206.c,VULNERABLE,yes
3,gemini_pro-14924.c,VULNERABLE,yes
9,falcon180b-35726.c,VULNERABLE,yes
12,gemma7b-15674.c,NON-VULNERABLE,yes
13,gemma7b-6476.c,VULNERABLE,yes
...,...,...,...
799328,gpt35-52018.c,VULNERABLE,yes
799329,codellama_13b-4645.c,VULNERABLE,yes
799331,gpt35-4019.c,VULNERABLE,yes
799332,gpt35-67231.c,VULNERABLE,yes


In [11]:
# Keep only rows labelled 'VULNERABLE' or 'NON-VULNERABLE'; rows with any other
# category value (such as 'PARSING ERROR') are dropped.
# .copy() makes the filtered result an independent frame, so the column added
# below is never written into a slice of the previous frame (the pattern that
# raises pandas' SettingWithCopyWarning).
df = df[df['category'].isin(['VULNERABLE', 'NON-VULNERABLE'])].copy()

# Binary target column: 1 for 'VULNERABLE', 0 for 'NON-VULNERABLE'.
# Vectorised comparison instead of a per-row lambda; the explicit 'int64' keeps
# the dtype identical on every platform.
df['vulnerability'] = (df['category'] == 'VULNERABLE').astype('int64')

df

Unnamed: 0,file_name,category,verification_finished,vulnerability
0,gpt4o_mini-34206.c,VULNERABLE,yes,1
3,gemini_pro-14924.c,VULNERABLE,yes,1
9,falcon180b-35726.c,VULNERABLE,yes,1
12,gemma7b-15674.c,NON-VULNERABLE,yes,0
13,gemma7b-6476.c,VULNERABLE,yes,1
...,...,...,...,...
799328,gpt35-52018.c,VULNERABLE,yes,1
799329,codellama_13b-4645.c,VULNERABLE,yes,1
799331,gpt35-4019.c,VULNERABLE,yes,1
799332,gpt35-67231.c,VULNERABLE,yes,1


In [12]:
# Derive the generating model's name from file_name: everything before the
# first '-' (e.g. 'codellama_13b-4645.c' -> 'codellama_13b').
# The .str accessor is vectorised and tolerates missing values, unlike a
# per-row lambda. assign() returns a new frame, so nothing is written into a
# frame that may itself be a filtered slice.
df = df.assign(model=df['file_name'].str.split('-', n=1).str[0])
df

Unnamed: 0,file_name,category,verification_finished,vulnerability,model
0,gpt4o_mini-34206.c,VULNERABLE,yes,1,gpt4o_mini
3,gemini_pro-14924.c,VULNERABLE,yes,1,gemini_pro
9,falcon180b-35726.c,VULNERABLE,yes,1,falcon180b
12,gemma7b-15674.c,NON-VULNERABLE,yes,0,gemma7b
13,gemma7b-6476.c,VULNERABLE,yes,1,gemma7b
...,...,...,...,...,...
799328,gpt35-52018.c,VULNERABLE,yes,1,gpt35
799329,codellama_13b-4645.c,VULNERABLE,yes,1,codellama_13b
799331,gpt35-4019.c,VULNERABLE,yes,1,gpt35
799332,gpt35-67231.c,VULNERABLE,yes,1,gpt35


In [14]:
# Destination of the cleaned dataset.
output_csv_file = 'D:\\git\\dscp\\data\\classification_cleaned.csv'

# Export only the columns listed here, and omit the pandas index from the file.
columns_to_export = ['file_name', 'vulnerability', 'model']
df.loc[:, columns_to_export].to_csv(output_csv_file, index=False)