In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hate-norm/train.csv
/kaggle/input/hate-norm/test.csv


In [2]:
!pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.3.6


In [3]:
import pandas as pd
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data from train.csv
train_df = pd.read_csv("/kaggle/input/hate-norm/train.csv", delimiter="|", names=["id", "sentence", "span", "bio"])

# Feature extraction function
def word2features(sentence, i):
    word = sentence[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sentence[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sentence)-1:
        word1 = sentence[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    return features

def sentence2features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]

def sentence2labels(sentence):
    labels = sentence['bio'].split()
    # print(labels)
    if len(sentence['sentence'].split()) != len(labels):
        # Handle mismatch by truncating or padding labels
        if len(labels) > len(sentence['sentence'].split()):
            labels = labels[:len(sentence['sentence'].split())]
        else:
            labels.extend(['O'] * (len(sentence['sentence'].split()) - len(labels)))
    return labels


# Prepare data
X = train_df['sentence'].str.split().apply(sentence2features).tolist()
y = train_df.apply(sentence2labels, axis=1)
# print(len(X), len(y))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print(X)
# print(y_train)
# print("Hello")
# Train CRF model
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)
crf.fit(X_train, y_train)

# Evaluate model
y_pred = crf.predict(X_test)
# print(classification_report(y_test, y_pred))

# Predict on new data (test.csv)
test_df = pd.read_csv("/kaggle/input/hate-norm/test.csv", delimiter="|", names=["id", "sentence"])
X_test_new = test_df['sentence'].str.split().apply(sentence2features).tolist()
y_pred_new = crf.predict(X_test_new)

# Append predicted BIO tags to test.csv
# Convert y_pred_new to string
y_pred_new = [' '.join(i) for i in y_pred_new]
test_df['bio'] = y_pred_new
output_df = test_df[['id', 'bio']]
output_df.to_csv("test_with_bio.csv", sep=",", index=False)

# print(y_pred)


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Convert sequences of sequences to binary arrays
mlb = MultiLabelBinarizer()
y_test_bin = mlb.fit_transform(y_test)
y_pred_bin = mlb.transform(y_pred)

# Print classification report
print(classification_report(y_test_bin, y_pred_bin, target_names=mlb.classes_))


              precision    recall  f1-score   support

           B       1.00      0.89      0.94       485
           I       0.78      0.67      0.72       246
           O       0.99      1.00      0.99       479

   micro avg       0.96      0.89      0.92      1210
   macro avg       0.92      0.85      0.88      1210
weighted avg       0.95      0.89      0.92      1210
 samples avg       0.96      0.89      0.91      1210

