In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
train_ds = pd.read_csv('train.csv')

In [3]:
train_ds

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X = train_ds['comment_text']
y = train_ds[train_ds.columns[2:]].values

In [6]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [7]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [8]:
y.shape

(159571, 6)

In [9]:
text = train_ds.comment_text.str.lower()
lens = [len(s.split()) for s in text]
SEQUENCE_LENS = int(np.quantile(lens, 0.95))

In [10]:
SEQUENCE_LENS

230

In [11]:
vectorizer = TextVectorization(output_sequence_length=SEQUENCE_LENS,
                              output_mode='int')

vectorizer.adapt(X.values)

In [12]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [13]:
VOCAB_SIZE = vectorizer.vocabulary_size()

In [14]:
VOCAB_SIZE

257825

In [15]:
vectorized_text = vectorizer(X.values)

In [16]:
vectorized_text

<tf.Tensor: shape=(159571, 230), dtype=int64, numpy=
array([[   645,     76,      2, ...,      0,      0,      0],
       [219427,     54,   2489, ...,      0,      0,      0],
       [   425,    441,     70, ...,      0,      0,      0],
       ...,
       [ 32445,   7392,    383, ...,      0,      0,      0],
       [     5,     12,    534, ...,      0,      0,      0],
       [     5,      8,    130, ...,      0,      0,      0]])>

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val = train_test_split(np.array(vectorized_text), y, 
                                                  test_size = 0.2, 
                                                  random_state = 142)

In [19]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((127656, 230), (31915, 230), (127656, 6), (31915, 6))

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding
from tensorflow.keras.losses import BinaryCrossentropy

In [21]:
model = Sequential()

model.add(Embedding(VOCAB_SIZE, 300))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [22]:
model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])

In [23]:
train_data= tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_data= train_data.batch(32).prefetch(tf.data.AUTOTUNE)

val_data= tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_data= val_data.batch(32).prefetch(tf.data.AUTOTUNE)

In [24]:
model.fit(train_data, epochs = 2, validation_data=val_data)

Epoch 1/2
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2093s[0m 524ms/step - accuracy: 0.9575 - loss: 0.0844 - val_accuracy: 0.9938 - val_loss: 0.0515
Epoch 2/2
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2112s[0m 529ms/step - accuracy: 0.9847 - loss: 0.0416 - val_accuracy: 0.9938 - val_loss: 0.0530


<keras.src.callbacks.history.History at 0x2ce75b550>

In [25]:
val_predict = model.predict(X_val)
(val_predict>0.5).astype(int)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 35ms/step


array([[1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [26]:
test_ds = pd.read_csv('test.csv')
test_ds

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [27]:
test_text=test_ds.comment_text.str.lower()
text_ds = vectorizer(test_text)
test_predict = model.predict(text_ds)
(test_predict>0.5).astype(int)

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 35ms/step


array([[1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0]])

In [28]:
model.save('toxic_comment.keras')

In [None]:
from googleapiclient.discovery import build
api_key = str(input("Enter your API key: "))
video_id = str(input("Enter your video id: "))
max_comments = int(input("Enter numbers of comments: "))

youtube = build('youtube', 'v3', developerKey=api_key)
def get_video_comments(video_id, max_comments):
    comments = []
    total_comments = 0
    page_token = None
    
    while total_comments < max_comments:
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=min(100, max_comments - total_comments), 
            pageToken=page_token
        )
        
        response = request.execute()
        items = response.get('items', [])
        for item in items:
            comment = item['snippet']['topLevelComment']['snippet']
            comments.append({'id': comment['authorDisplayName'], 'comment_text': comment['textOriginal']})
            total_comments += 1
            if total_comments >= max_comments:
                break
        
        # Check if there are more pages of comments
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return comments


comments = get_video_comments(video_id,max_comments)

df_comments = pd.DataFrame(comments)
df_comments

In [None]:
ytb_comments = df_comments.comment_text.str.lower()
ytb_text = vectorizer(ytb_comments)

ytb_predict = model.predict(ytb_text)
result = (ytb_predict>0.5).astype(int)
(ytb_predict>0.5).astype(int)

In [None]:
toxic = np.sum(result[:, 0])
severe_toxic = np.sum(result[:, 1])
obscene = np.sum(result[:, 2])
threat = np.sum(result[:, 3])
insult = np.sum(result[:, 4])
identity_hate = np.sum(result[:, 5])

In [None]:
print("Toxic comments: ", toxic)
print("Severe toxic comments: ", severe_toxic)
print("Obscene comments: ", obscene)
print("Threat comments: ", threat)
print("Insult comments: ", insult)
print("Identity Hate comments: ", identity_hate)