In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries

In [None]:
from tqdm import tqdm
import collections

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import emoji

# Reading data

In [None]:
# Parse the training CSV by hand. The comment text can itself contain commas,
# so each line is split on "," and then reassembled: the first cell is the
# comment id, the last eight cells fill the last eight header columns, and
# everything in between is re-joined into the comment text.
# The file is opened in a `with` block so the handle is always closed, and the
# encoding is given explicitly instead of depending on the platform default.
with open("../input/multilingualabusivecomment/ShareChat-IndoML-Datathon-NSFW-CommentChallenge_Train.csv", "r", encoding="utf-8") as fil:
    first_row = fil.readline()
    columns = first_row.strip("\n").split(",")
    parsed = {column: [] for column in columns}
    og_rows = 0  # number of data lines read, including the ones skipped below
    for line in fil:
        og_rows += 1
        cells = line.strip("\n").split(",")
        if len(cells) < len(columns):
            # Not enough cells to fill every column: skip the line.
            continue
        parsed['CommentId'].append(cells[0])
        parsed['commentText'].append(",".join(cells[1:-8]))
        # Fill the eight trailing columns from the right-hand end of the line.
        for i in range(1, 9):
            parsed[columns[-i]].append(cells[-i])
data = pd.DataFrame(parsed)
data.tail(20)

In [None]:
data.head(20)

In [None]:
data[np.any(data.isna(), axis=1)].tail(20)

In [None]:
def intify(x):
    """Convert ``x`` to ``np.int32``, returning ``np.nan`` when it cannot be parsed.

    Only conversion failures (``ValueError``, ``TypeError``, ``OverflowError``)
    are turned into NaN. Anything else, including ``KeyboardInterrupt``,
    propagates instead of being silently swallowed.
    """
    try:
        return np.int32(x)
    except (ValueError, TypeError, OverflowError):
        return np.nan

In [None]:
# CommentId is cast directly; every other numeric column goes through intify,
# which yields NaN for cells that cannot be parsed as integers.
data["CommentId"] = data["CommentId"].astype(np.uint32)
for numeric_column in (
    "user_index",
    "post_index",
    "report_count_comment",
    "report_count_post",
    "like_count_comment",
    "like_count_post",
    "label",
):
    data[numeric_column] = data[numeric_column].apply(intify)

Checking bad rows

In [None]:
data[np.any(data.isna(), axis=1)]

In [None]:
# Keep only the rows in which no column is missing.
data = data.dropna(axis=0)
data.head(30)

In [None]:
data["language"].value_counts()

Checking the row with an invalid `language` value ('21')

In [None]:
data[data["language"] == '21']

In [None]:
# Drop the rows whose language field is the stray value '21'. The explicit
# .copy() makes `data` an independent frame rather than a slice of the previous
# one, so the later column assignments (data["commentText"] = ...) do not
# raise SettingWithCopyWarning.
data = data[data["language"] != '21'].copy()
data.head(20)

Language Distribution

In [None]:
# Number of comments per language.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x="language", ax=ax)
ax.tick_params(axis="x", labelrotation=40)
plt.show()

In [None]:
# Number of comments per language, split by label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x="language", hue="label", ax=ax)
ax.tick_params(axis="x", labelrotation=40)
plt.show()

In [None]:
columns

In [None]:
# data[["label", "user_index"]].groupby(by="user_index").count()

In [None]:
# Remove punctuation and digits from the comment text.
# Inside the character class, '-/ is a range (ASCII 0x27-0x2F) that covers
# ' ( ) * + , - . / ; the class additionally lists . , " : + and any digit (\d).
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of being an invalid escape sequence in a normal string literal.
data["commentText"] = data["commentText"].apply(lambda x: re.sub(r"[.,\"'-/:\d+]", "", x))
data.head(30)

In [None]:
data.tail(30)

In [None]:
def give_emoji_free_text(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp`` is not available in emoji 2.x, so
    ``emoji.replace_emoji`` is used whenever the installed package provides it;
    the regexp helper is kept only as a fallback for older releases.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)

data["commentText"] = data["commentText"].apply(give_emoji_free_text)

In [None]:
data[["commentText", "label"]].to_csv("train_data.csv", index=None)

# Vocab

In [None]:
# Collect the alphanumeric tokens of all label == 0 comments; tokens that are
# not alphanumeric are recorded as "".
clean_tokens = []
for text in tqdm(data[data["label"]==0]["commentText"]):
    for piece in text.split(" "):
        token = piece.strip(" ")
        clean_tokens.append(token if token.isalnum() else "")
print(len(clean_tokens))
ok_eng_counts = collections.Counter(clean_tokens)
# Of the 1000 most common tokens, keep those seen more than 250 times; every
# other entry collapses to "" (so "" can itself be a member of the set).
ok_eng_words = {word if count > 250 else "" for word, count in ok_eng_counts.most_common(1000)}
len(ok_eng_words)

In [None]:
# Simple sanity check for setting safewords
"randi" in ok_eng_words  # fails for >1200 common-words

In [None]:
# Collect the non-alphanumeric tokens of all label == 0 comments (after
# stripping quotes, commas, periods and spaces from the token ends);
# alphanumeric tokens are recorded as "".
other_tokens = []
for text in tqdm(data[data["label"]==0]["commentText"]):
    for piece in text.split(" "):
        token = piece.strip('"').strip(",").strip(".").strip(" ")
        other_tokens.append("" if token.isalnum() else token)
print(len(other_tokens))
ok_ot_counts = collections.Counter(other_tokens)
# Of the 2000 most common tokens, keep those seen more than 50 times; every
# other entry collapses to "".
ok_ot_words = {word if count > 50 else "" for word, count in ok_ot_counts.most_common(2000)}
len(ok_ot_words)

In [None]:
# Build the blacklist from label == 1 comments in two passes:
#   pass 1 counts alphanumeric tokens and filters them against ok_eng_words,
#   pass 2 counts non-alphanumeric tokens and filters them against ok_ot_words.
# In each pass a token is added when it is among the 1000 most common tokens,
# occurs more than 100 times, and is absent from the pass's "ok" vocabulary.
blacklist = []
flagged_comments = data[data["label"]==1]["commentText"]
for keep_alnum, safe_words in ((True, ok_eng_words), (False, ok_ot_words)):
    sus_words = []
    for text in tqdm(flagged_comments):
        for piece in text.split(" "):
            token = piece.strip('"').strip(",").strip(".").strip(" ")
            # Tokens of the other kind are recorded as "" rather than dropped.
            sus_words.append(token if token.isalnum() == keep_alnum else "")
    sus_counts = collections.Counter(sus_words)
    for word, count in sus_counts.most_common(1000):
        if count > 100 and word not in safe_words:
            blacklist.append(word)
len(blacklist)

In [None]:
# Build the greylist from label == 1 comments in two passes over the 3000 most
# common tokens:
#   pass 1: alphanumeric tokens, count > 100, not in ok_eng_words;
#   pass 2: non-alphanumeric tokens, count > 70, not in ok_ot_words.
greylist = []
flagged_comments = data[data["label"]==1]["commentText"]
for keep_alnum, min_count, safe_words in (
    (True, 100, ok_eng_words),
    (False, 70, ok_ot_words),
):
    sus_words = []
    for text in tqdm(flagged_comments):
        for piece in text.split(" "):
            token = piece.strip('"').strip(",").strip(".").strip(" ")
            # Tokens of the other kind are recorded as "" rather than dropped.
            sus_words.append(token if token.isalnum() == keep_alnum else "")
    sus_counts = collections.Counter(sus_words)
    for word, count in sus_counts.most_common(3000):
        if count > min_count and word not in safe_words:
            greylist.append(word)
len(greylist)

In [None]:
print(" ".join(blacklist[::-1][:150]))
print(" ".join(blacklist[:150]))

In [None]:
print(" ".join(blacklist))

# FE and Model

In [None]:
# count_vectorizer = CountVectorizer(vocabulary=blacklist)
# X = count_vectorizer.fit_transform(data["commentText"].apply(lambda x: re.sub(",.","",x))).toarray()
# print(X.shape)

In [None]:
# Fixed-vocabulary TF-IDF features over the greylist tokens. The vocabulary is
# de-duplicated and then sorted: iterating a bare set of strings can produce a
# different order in every Python process (string hash randomisation), which
# would shuffle the feature columns from run to run.
tf_vectorizer = TfidfVectorizer(vocabulary=sorted(set(greylist)))
# "[,.]" is a character class: it removes commas and periods. (Without the
# brackets the pattern means "a comma followed by any one character".)
X = tf_vectorizer.fit_transform(data["commentText"].apply(lambda x: re.sub("[,.]","",x))).toarray()
print(X.shape)

In [None]:
Y = data["label"]
print(Y.shape)

In [None]:
# subset_index = np.where(data["commentText"].apply(lambda x: True if np.any([word in set(blacklist) for word in list(map(lambda y: y.strip(",").strip(".").strip(" "), x.split(" ")))]) else False))
# subset_index[0].shape

In [None]:
# subset_index[0][-10:]

In [None]:
# print(accuracy_score(np.array(Y)[subset_index[0]], np.array(Y)[subset_index[0]]*0+1))
# print(f1_score(np.array(Y)[subset_index[0]], np.array(Y)[subset_index[0]]*0+1))

In [None]:
# X[subset_index[0]].shape

In [None]:
# np.array(Y)[subset_index[0]].sum()

In [None]:
# data.loc[subset_index].head(30)

In [None]:
# data.loc[subset_index].tail(30)

In [None]:
# del(data)

In [None]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [None]:
# del(X)
# del(Y)

In [None]:
from sklearn.naive_bayes import CategoricalNB
# NOTE(review): this estimator is never fitted in the visible cells (the fit
# calls in the following cells are commented out), and `model` is later rebound
# to a CatBoostClassifier.
model = CategoricalNB()

In [None]:
# model.fit(X, Y)
# print(accuracy_score(model.predict(X), Y))
# print(f1_score(model.predict(X), Y))

In [None]:
# model.fit(X_train.toarray(), Y_train)
# print(accuracy_score(X_test.toarray(), Y_test))
# print(f1_score(model.predict(X_test.toarray()), Y_test))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(max_depth=5)
# from sklearn.linear_model import SGDClassifier
# model = SGDClassifier(verbose=1, max_iter=2)
# model.fit(X[:-1000], Y[:-1000])
# print(accuracy_score(model.predict(X[-1000:]),Y[-1000:]))
# print(f1_score(model.predict(X[-1000:]),Y[-1000:]))

In [None]:
X

In [None]:
Y

In [None]:

from catboost import CatBoostClassifier

current_seed = 27
print("Using seed:", current_seed)

# Pass the seed to the classifier so that the seed announced above is the one
# the training run actually uses.
model = CatBoostClassifier(random_seed=current_seed)
print("Model fitting...")
model.fit(X, Y)
# model.booster_.save_model("model_booster_weights.txt", num_iteration=model.best_iteration_)
print("Training end.")

In [None]:
# NOTE: X and Y are the rows the model was fitted on, so these are training-set
# scores for the last 10000 rows, not a held-out estimate.
# Predict once and reuse the result for both metrics.
train_tail_pred = model.predict(X[-10000:])
# scikit-learn metrics take their arguments as (y_true, y_pred).
print(accuracy_score(Y[-10000:], train_tail_pred))
print(f1_score(Y[-10000:], train_tail_pred))

In [None]:
# model.booster_.save_model("model_booster_weights.txt", num_iteration=model.best_iteration_)

In [None]:
Y[-1000:].sum()

Test Data

In [None]:
# Parse the test CSV by hand, keeping only the comment id and the comment text.
# As in the training file, the text may contain commas: the first cell is the
# id and the last seven cells are treated as the trailing columns.
# The file is opened in a `with` block so the handle is always closed, and the
# encoding is given explicitly instead of depending on the platform default.
with open("../input/multilingualabusivecomment/ShareChat-IndoML-Datathon-NSFW-CommentChallenge_Test_20_Percent_NoLabel.csv", "r", encoding="utf-8") as fil:
    first_row = fil.readline()
    columns = first_row.strip("\n").split(",")
    # Only the first two header columns (id and text) are collected.
    test_records = {column: [] for column in columns[:2]}
    for line in fil:
        cells = line.strip("\n").split(",")
        test_records['CommentId'].append(cells[0])
        if len(cells) < len(columns):
            # Short line: keep the id (every row must receive a label) but
            # record the text as empty.
            test_records['commentText'].append("")
        else:
            test_records['commentText'].append(",".join(cells[1:-7]))
test_data = pd.DataFrame(test_records)
test_data.tail(20)

In [None]:
# test_data["commentText"] = test_data["commentText"].apply(lambda x: re.sub("[.,\"'-/:\d+]", "", x))
# test_data["commentText"] = test_data["commentText"].apply(give_emoji_free_text)
# Use transform, not fit_transform: the vectorizer was already fitted on the
# training comments, and re-fitting it here would recompute the IDF weights
# from the test set, so the features would no longer match the ones the model
# was trained on.
# "[,.]" is a character class that removes commas and periods. (Without the
# brackets the pattern means "a comma followed by any one character", which
# would also delete the first letter of the following word.)
test_features = tf_vectorizer.transform(test_data["commentText"].apply(lambda x: re.sub("[,.]","",x))).toarray()
test_data["label"] = np.int32(model.predict(test_features))
test_data.tail(25)

In [None]:
" " in blacklist

In [None]:
# Rule-based override: a test comment is flagged (1) when it is longer than one
# character and at least one of its space-separated tokens, with commas,
# periods and spaces stripped from the ends, is in the blacklist.
# The rule is evaluated on the TEST comments so that labels_2 lines up row for
# row with test_data["label"].
blacklist_set = set(blacklist)  # built once instead of once per token
labels_2 = test_data["commentText"].apply(
    lambda x: 1 if (
        len(x) > 1
        and any(word.strip(",").strip(".").strip(" ") in blacklist_set for word in x.split(" "))
    ) else 0
)
# np.clip takes (values, min, max): a comment flagged by either the model or
# the blacklist rule ends up with label 1.
test_data["label"] = np.clip(test_data["label"] + labels_2, 0, 1)

In [None]:
# Store the combined labels as 32-bit integers before they are written to the
# submission file.
test_data["label"] = np.int32(test_data["label"])

In [None]:
test_data.head(50)

In [None]:
test_data[["CommentId", "label"]].to_csv("submission.csv", index=None)