In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [10]:
data = pd.read_csv('gdpr_violations.csv')
data.head()

Unnamed: 0,id,picture,name,price,authority,date,controller,article_violated,type,source,summary
0,1,https://www.privacyaffairs.com/wp-content/uplo...,Poland,9380,Polish National Personal Data Protection Offic...,10/18/2019,Polish Mayor,Art. 28 GDPR,Non-compliance with lawful basis for data proc...,https://uodo.gov.pl/decyzje/ZSPU.421.3.2019,No data processing agreement has been conclude...
1,2,https://www.privacyaffairs.com/wp-content/uplo...,Romania,2500,Romanian National Supervisory Authority for Pe...,10/17/2019,UTTIS INDUSTRIES,Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...,Information obligation non-compliance,https://www.dataprotection.ro/?page=A_patra_am...,A controller was sanctioned because he had unl...
2,3,https://www.privacyaffairs.com/wp-content/uplo...,Spain,60000,Spanish Data Protection Authority (AEPD),10/16/2019,Xfera Moviles S.A.,Art. 5 GDPR|Art. 6 GDPR,Non-compliance with lawful basis for data proc...,https://www.aepd.es/resoluciones/PS-00262-2019...,The company had unlawfully processed the perso...
3,4,https://www.privacyaffairs.com/wp-content/uplo...,Spain,8000,Spanish Data Protection Authority (AEPD),10/16/2019,Iberdrola Clientes,Art. 31 GDPR,Failure to cooperate with supervisory authority,https://www.aepd.es/resoluciones/PS-00304-2019...,Iberdrola Clientes violated Article 13 of the ...
4,5,https://www.privacyaffairs.com/wp-content/uplo...,Romania,150000,Romanian National Supervisory Authority for Pe...,10/09/2019,Raiffeisen Bank SA,Art. 32 GDPR,Failure to implement sufficient measures to en...,https://www.dataprotection.ro/?page=Comunicat_...,Raiffeisen Bank Romania did not observe the ne...


In [11]:
article_encoder = LabelEncoder()
data["article_violated_encoded"] = article_encoder.fit_transform(data["article_violated"])

# Risk Assessment Function
def risk_assessment(article):
    high_risk = ["Art. 5 GDPR", "Art. 6 GDPR", "Art. 32 GDPR"]
    medium_risk = ["Art. 12 GDPR", "Art. 13 GDPR", "Art. 31 GDPR"]
    return "High" if article in high_risk else "Medium" if article in medium_risk else "Low"


In [4]:
# Apply Risk Assessment
data["risk_level"] = data["article_violated"].apply(risk_assessment)

# Encode `risk_level`
risk_encoder = LabelEncoder()
data["risk_level_encoded"] = risk_encoder.fit_transform(data["risk_level"])

In [5]:
# Feature Engineering
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data['summary'])


In [6]:
# Train-Test Split
X_train, X_test, y_train_articles, y_test_articles, y_train_risk, y_test_risk = train_test_split(
    X_text, data["article_violated_encoded"], data["risk_level_encoded"], test_size=0.2, random_state=42
)

# Train Model for Predicting `article_violated`
article_model = RandomForestClassifier(n_estimators=100, random_state=42)
article_model.fit(X_train, y_train_articles)


In [8]:
# Predictions for `article_violated`
y_pred_articles = article_model.predict(X_test)

# Convert predicted articles back to original labels
y_pred_articles_real = article_encoder.inverse_transform(y_pred_articles)
print("Predicted Violations:", y_pred_articles_real)


Predicted Violations: ['Art. 5 (1) GDPR|Art. 6 GDPR|Art. 7 GDPR'
 'Art. 5 (1) a) GDPR|Art. 6 GDPR'
 'Art. 5 (1) c) GDPR|Art. 12 GDPR|Art. 13 GDPR|Art. 32 GDPR'
 'Art. 5 (1) a) GDPR' 'Art. 6 GDPR' 'Art. 6 GDPR' 'Art. 6 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR' 'Art. 32 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR|Art. 21 GDPR' 'Art. 32 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR' 'Art. 13 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR|Art. 7 GDPR|Art. 21 GDPR' 'Art. 13 GDPR'
 'Art. 5 (1) f) GDPR|Art. 32 GDPR' 'Art. 15 (1), (3) GDPR'
 'Art. 6 (1) GDPR' 'Art. 32 GDPR' 'Art. 32 GDPR|Art. 33 GDPR'
 'Art. 5 (1) a) GDPR|Art. 6 (1) a) GDPR' 'Art. 12 (4) GDPR|Art. 15 GDPR'
 'Art. 5 (1) a)|Art. 7 (3) GDPR' 'Art. 5 GDPR|Art. 6 GDPR'
 'Art. 5 (1) c) GDPR' 'Art. 6 GDPR' 'Art. 15 GDPR' 'Art. 6 GDPR'
 'Art. 5 (1) f) GDPR|Art. 32 GDPR' 'Art. 6 GDPR|Art. 5 (1) a) GDPR'
 'Art. 5 (1) f) GDPR|Art. 32 GDPR' 'Art. 6 (1) GDPR' 'Art. 6 GDPR'
 'Art. 13 GDPR|Art. 14 GDPR|Art. 6 GDPR|Art. 4 GDPR|Art. 5 GDPR'
 'Art. 58 GDPR' 'Art. 13 GDPR' 'Art. 6 GDPR' 'Art. 3

In [9]:
# Train Model for Predicting `risk_level`
risk_model = RandomForestClassifier(n_estimators=100, random_state=42)
risk_model.fit(X_train, y_train_risk)  # Fix: Use X_train instead of labels

# Predict `risk_level`
y_pred_risk = risk_model.predict(X_test)

# Convert predicted risk levels back to original labels
y_pred_risk_real = risk_encoder.inverse_transform(y_pred_risk)
print("Predicted Risk Levels:", y_pred_risk_real)


Predicted Risk Levels: ['Low' 'Low' 'Low' 'Low' 'Low' 'High' 'High' 'Low' 'Low' 'Low' 'High'
 'Low' 'Medium' 'Low' 'Medium' 'Low' 'Low' 'Low' 'High' 'Low' 'Low' 'Low'
 'Low' 'Low' 'Low' 'High' 'Low' 'Low' 'Low' 'Low' 'Low' 'Low' 'Low' 'Low'
 'Low' 'Medium' 'Low' 'Low' 'High' 'Low' 'Low' 'Low' 'Low' 'High' 'Low'
 'High' 'Low' 'Low' 'Low' 'Low' 'Low' 'High' 'Low' 'Low' 'Low' 'High'
 'Low' 'High' 'Low' 'Low' 'Low' 'Low' 'Low' 'Low' 'Low' 'High' 'High'
 'High' 'High' 'High' 'Low' 'Low' 'Low' 'High' 'Low' 'Low' 'Low' 'High'
 'Low' 'Low' 'Low' 'Low' 'Low' 'Low' 'High' 'High' 'Low' 'Low']
