In [9]:
import requests
import zipfile
import io
import os

def download_and_extract_zip(url, target_dir):
    # Make a GET request to the URL
    response = requests.get(url)
    
    # Raise an exception for a non-200 status code
    response.raise_for_status()
    
    # Create a BytesIO object from the response content
    zip_content = io.BytesIO(response.content)
    
    # Open the ZIP file
    with zipfile.ZipFile(zip_content, 'r') as zip_ref:
        # Extract all contents to the target directory
        zip_ref.extractall(target_dir)
    
    print("Extraction complete.")

# URL of the ZIP file
zip_url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/9sxpkmm8xn-1.zip"

# Directory where you want to extract the contents
target_directory = "./data"

# Check if the target directory already contains the extracted files
if not os.path.exists(target_directory):
    os.makedirs(target_directory)
    print("Directory created.")
    
    # Call the function to download and extract the ZIP file
    try:
        download_and_extract_zip(zip_url, target_directory)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download and extract ZIP file: {e}")
else:
    print("Directory already exists. Skipping download and extraction.")


Directory already exists. Skipping download and extraction.


In [10]:
import pandas as pd

# Path to the CSV file
csv_file_path = "./data/A Curated Hate Speech Dataset/HSData/0_RawData/data_huang_devansh.csv"

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame to check if it's loaded correctly
df.head()

Unnamed: 0,Content,Label
0,`- This is not ``creative``. Those are the di...,0
1,` :: the term ``standard model`` is itself le...,0
2,"True or false, the situation as of March 200...",0
3,"Next, maybe you could work on being less cond...",0
4,This page will need disambiguation.,0


In [11]:
# Print unique values in the "Label" column
unique_labels = df['Label'].unique()

# Display the unique values
print("Unique Labels:")
print(unique_labels)

Unique Labels:
[0 1]


In [12]:
# Count the occurrences of each label
label_counts = df['Label'].value_counts()

# Display the label counts
print("Label Counts:")
print(label_counts)

Label Counts:
Label
0    708641
1    133694
Name: count, dtype: int64


## Data Preprocessing

1. Remove multiple spaces, hyperlinks, user mentions, emojis, and emoticons converted to text, and removed new line characters
2. Removing date and time values
3. Removing accented numbers and characters (e.g., ^ea, or ^12)
4. The remaining numbers are converted to words
5. Removing ampersands from the beginning of words
6. Removing the following characters (_"\-;%()|+&=*%.,!?:#$@[]/) from the text

In [13]:
import re

def preprocess_text(text):
    # Check if the input is a non-null string
    if isinstance(text, str) and not pd.isnull(text):
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove hyperlinks
        text = re.sub(r'http\S+', '', text)

        # Remove user mentions (assuming mentions start with @)
        text = re.sub(r'@\w+', '', text)

        # Remove emojis and emoticons
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   # Add more ranges as needed
                                   "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)

        # Remove new line characters
        text = text.replace('\n', '')

        return text
    else:
        # Return an empty string for NaN values
        return ''

# Create a new DataFrame with preprocessed content
df_preprocessed = pd.DataFrame({
    'Content': df['Content'].apply(preprocess_text),
    'Label': df['Label']
})

# Display the first few rows of the new DataFrame
df_preprocessed.head()

Unnamed: 0,Content,Label
0,`- This is not ``creative``. Those are the dic...,0
1,` :: the term ``standard model`` is itself les...,0
2,"True or false, the situation as of March 2002...",0
3,"Next, maybe you could work on being less cond...",0
4,This page will need disambiguation.,0


In [14]:
from word2number import w2n  # Library for converting numbers to words
from unidecode import unidecode  # Library for removing accented characters

# Function to remove date and time values
def remove_date_time(text):
    # Implement your logic to remove date and time values
    # For example, you can use regular expressions to identify and remove them
    # Here's a simple example that removes strings with digits and colons
    return re.sub(r'\b\d{1,2}:\d{2}\b|\b\d{1,2}/\d{1,2}/\d{2,4}\b', '', text)

# Function to remove accented numbers and characters
def remove_accented_chars(text):
    return unidecode(text)

# Function to convert remaining numbers to words
def convert_numbers_to_words(text):
    # Replace numerical values with their word representations
    words = []
    for word in text.split():
        try:
            words.append(w2n.word_to_num(word))
        except ValueError:
            # Handle the case where w2n.word_to_num raises a ValueError
            words.append(word)
        except IndexError:
            # Handle the case where the list is empty
            pass
    return ' '.join(map(str, words))

# Function to remove ampersands from the beginning of words
def remove_ampersands(text):
    return re.sub(r'\b&(\w+)\b', r'\1', text)

# Function to remove specified characters from the text
def remove_special_characters(text):
    special_chars = r'_"\\;%\(\)|\+`&=*%,.!?:#$@[\]/-'
    return re.sub('[' + special_chars + ']', '', text)

# Apply the defined functions in sequence to the "Content" column
df_preprocessed['Content'] = df_preprocessed['Content'].apply(remove_date_time)
df_preprocessed['Content'] = df_preprocessed['Content'].apply(remove_accented_chars)
df_preprocessed['Content'] = df_preprocessed['Content'].apply(convert_numbers_to_words)
df_preprocessed['Content'] = df_preprocessed['Content'].apply(remove_ampersands)
df_preprocessed['Content'] = df_preprocessed['Content'].apply(remove_special_characters)

# Save the new DataFrame to a CSV file
df_preprocessed.to_csv('preprocessed_dataset.csv', mode='w', index=False)

# Display the first few rows of the new DataFrame
df_preprocessed.head()

Unnamed: 0,Content,Label
0,This is not creative Those are the dictionary...,0
1,the term standard model is itself less NPOV ...,0
2,True or false the situation as of March 2002 w...,0
3,Next maybe you could work on being less condes...,0
4,This page will need disambiguation,0


In [15]:
from sklearn.model_selection import train_test_split

# Assuming 'df_preprocessed' is your DataFrame with the preprocessed content and labels
X = df_preprocessed['Content']
y = df_preprocessed['Label']

# 60% Train 20% Test 20% Validate
# Split the data into training and temporary sets
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Models

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming X_train_temp, y_train_temp, X_validate, y_validate, X_test, and y_test are available
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_temp)

X_train_sequences = tokenizer.texts_to_sequences(X_train_temp)
X_validate_sequences = tokenizer.texts_to_sequences(X_validate)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for the padding token

max_sequence_length = 100  # Adjust as needed
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_validate_padded = pad_sequences(X_validate_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

2023-10-07 08:09:41.314081: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-07 08:09:41.314108: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-07 08:09:41.314141: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-07 08:09:41.320588: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout

# Assuming vocab_size, embedding_dim, max_sequence_length, and num_classes are defined

model = Sequential()

model.add(Embedding(vocab_size, 100, input_length=max_sequence_length))

model.add(Flatten())

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          37690300  
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 128)               1280128   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 6

2023-10-07 08:10:09.073067: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-07 08:10:09.081578: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-07 08:10:09.081834: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [18]:
model.fit(X_train_padded, y_train_temp, epochs=10, batch_size=32, validation_data=(X_validate_padded, y_validate))

loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy}')

Epoch 1/10


2023-10-07 08:10:09.377552: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 202160400 exceeds 10% of free system memory.
2023-10-07 08:10:10.052494: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-10-07 08:10:10.891512: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f2538318970 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-07 08:10:10.891529: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 SUPER, Compute Capability 7.5
2023-10-07 08:10:10.894716: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-07 08:10:10.905810: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-10-07 08:10:10.966070: I ./tensorflow/compiler/jit/device

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9214980006217957
