# Lime
- 비속어 검출, 분포도 확인

In [None]:
# Mount Google Drive at /content/drive; the training file read in a later
# cell lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install lime

In [None]:
import pandas as pd
import csv

In [None]:
# Load the training spreadsheet from the mounted Drive into a DataFrame.
train = pd.read_excel("/content/drive/MyDrive/bad_content_dectection/train.xlsx")

In [None]:
# Preview the first rows of the loaded training data.
train.head()

In [None]:
# Model input: the raw comment text column.
x_train_text = train.comment_text

# Binary target: add up every column from the third one onward and cap the
# row total at 1, so any row with a total of 1 or more becomes 1.
# NOTE(review): presumably those columns are per-category 0/1 toxicity
# flags -- confirm against the spreadsheet.
# clip/astype is the vectorized equivalent of the former per-row
# int(min(x, 1)) lambda.
y_train = train[train.columns[2:]].sum(axis=1).clip(upper=1).astype(int)

In [None]:
# Sanity check: show the first few comments next to their binary labels.
print(x_train_text.head(), y_train.head())

In [None]:
from matplotlib import pyplot as plt

# Check the data distribution: histogram of the binary labels using two bins.
plt.hist(y_train, bins=2)

In [None]:
import sklearn
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer with default settings on the training comments and
# convert them to the feature matrix used for training.
vectorizer = TfidfVectorizer()
# Using the TF-IDF configuration below instead improves the results.
# vectorizer = TfidfVectorizer(min_df=10, stop_words='english')
x_train = vectorizer.fit_transform(x_train_text)

In [None]:
# Inspect the TF-IDF feature row produced for the first training comment.
x_train[0]

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Model to train: multinomial Naive Bayes fitted on the TF-IDF features and
# the binary labels.
clf = MultinomialNB()
clf.fit(x_train, y_train)

In [None]:
# Check the training accuracy: predictions are made on the same data the
# model was fitted on, so this is not a held-out evaluation.
pred = clf.predict(x_train)

print("정확도: ", sklearn.metrics.accuracy_score(y_train, pred))
# normalize='pred' is scikit-learn's option for normalizing the confusion
# matrix over the predicted labels (columns).
print("혼돈 매트릭스: \n", sklearn.metrics.confusion_matrix(y_train, pred, normalize='pred'))

In [None]:
def get_text_by_y_pred(y, p):
  """Return the row positions whose true label is `y` and predicted label is `p`.

  Reads the notebook-level `y_train` (true labels) and `pred` (predictions
  for the same rows, in the same order).

  Args:
    y: True label value to match.
    p: Predicted label value to match.

  Returns:
    A list of Python ints: the positions of the matching rows, ascending.
  """
  # Local import: in this notebook numpy is only imported in a later cell.
  import numpy as np

  # Compare positionally with one vectorized mask instead of looking up
  # y_train[i] row by row; this is faster and does not depend on the Series
  # index being 0..n-1.
  actual = np.asarray(y_train)
  predicted = np.asarray(pred)
  return np.flatnonzero((actual == y) & (predicted == p)).tolist()

# Fetch the indices of the training samples that fall into each cell of the
# confusion matrix. With these names, label 0 plays the role of the
# "positive" class: tp = (true 0, predicted 0), fn = (true 0, predicted 1),
# fp = (true 1, predicted 0), tn = (true 1, predicted 1).
tp, fn, fp, tn = (
  get_text_by_y_pred(actual, predicted)
  for actual, predicted in ((0, 0), (0, 1), (1, 0), (1, 1))
)

In [None]:
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

def predict_pipe(x):
  """Vectorize raw texts with the fitted vectorizer and return the classifier's predicted labels."""
  return clf.predict(vectorizer.transform(x))

# Analyze the i-th training example with LIME and display it in the notebook.
def explain(i, text=None, cls=None):
  """Explain one comment with LIME and render the explanation inline.

  Args:
    i: Row index into the training data. Used to look up the comment (and
      its label) when `text` is not given; always echoed in the header line.
    text: Optional custom comment to explain instead of training row `i`.
    cls: Optional true label (1 = Toxic, anything else = Normal). When
      `text` is omitted it defaults to the label of training row `i`. When a
      custom `text` is given without a label, the header shows "Unknown"
      instead of silently reporting "Normal".
  """
  # Fall back to training row i only for what the caller did not supply.
  # `is None` (rather than truthiness) keeps an explicit cls=0 intact and
  # still fills in the text when only a label was passed.
  if text is None:
    text = x_train_text[i]
    if cls is None:
      cls = y_train[i]

  class_names = ["Normal", "Toxic"]

  # LIME analysis: perturbs the text and queries the vectorizer+classifier
  # pipeline for class probabilities.
  pipe = make_pipeline(vectorizer, clf)
  explainer = LimeTextExplainer(class_names=class_names)
  exp = explainer.explain_instance(text, pipe.predict_proba)

  # Show the classifier's own prediction; take the single scalar out of the
  # one-element result before comparing.
  predicted = clf.predict(vectorizer.transform([text]))[0]
  predicted_name = "Toxic" if predicted == 1 else "Normal"
  if cls is None:
    real_name = "Unknown"
  else:
    real_name = "Toxic" if cls == 1 else "Normal"

  print()
  print(f"#{i} Predict: {predicted_name} Real: {real_name}")

  # Display in the notebook.
  exp.show_in_notebook(text=text)
  print()

In [None]:
import numpy as np

In [None]:
# Explain a fixed, hand-picked set of training rows. The commented-out line
# is the alternative: 10 random rows drawn from the `fn` index list.
#for i in np.random.choice(fn, 10):
for i in [51115, 152131, 155475, 90658, 114588]:
  explain(i)

In [None]:
# NOTE(review): evaluates the constant expression "10-5"; nothing else in the
# visible cells uses it, so this looks like a leftover scratch cell -- confirm
# before removing.
eval("10-5")

In [None]:
# Explain a hand-written comment. The index argument (1) is only echoed in
# the printed header because the text is supplied explicitly; no true label
# is passed.
text = """He's dead.  Watching it live.  mms://a352.l5671334351.c56713.n.lm.akamaistream.net/D/352/56713/v0001/reflector:34351"""
# Earlier attempts, kept for reference:
#eval(text, 1)
#explain(text, 1)
explain(1, text)

In [None]:
# Split the `fp` indices by comment length: under 100 characters vs. 100+.
shorts = [i for i in fp if len(x_train_text[i]) < 100]
longs = [i for i in fp if len(x_train_text[i]) >= 100]

# Draw up to 5 rows from each group. Sampling without replacement avoids
# explaining the same comment twice (np.random.choice samples with
# replacement by default), and capping the size keeps a group with fewer
# than 5 rows from raising.
shorts = np.random.choice(shorts, min(5, len(shorts)), replace=False)
longs = np.random.choice(longs, min(5, len(longs)), replace=False)

# Print the sampled indices, then show a LIME explanation for each of them.
print(shorts)
for i in shorts:
  explain(i)

print(longs)
for i in longs:
  explain(i)

In [None]:
# Explain a hand-written comment with an explicit true label of 1 (Toxic);
# the index argument (0) is only echoed in the printed header.
explain(0, "If there was a a god, I would know that you are going to hell, for being dishonest | immoral, but your days on Earth, in freedom [which you oppress] are numbered, until you go to jail or worse.", 1)

In [None]:
# Find the toxic words in a string and replace them with '*'.
def filter_toxic(text, threshold=0.1):
  """Mask the words LIME marks as toxic and print the original and masked text.

  Args:
    text: The comment to filter.
    threshold: Minimum LIME weight for a word to be masked. Defaults to 0.1.

  Returns:
    None. The original and filtered texts are printed.
  """
  # Local import: `re` is not imported in the visible notebook cells.
  import re

  pipe = make_pipeline(vectorizer, clf)
  explainer = LimeTextExplainer(class_names=[0, 1])
  exp = explainer.explain_instance(text, pipe.predict_proba)

  # as_list() yields (word, weight) pairs; keep the words whose weight
  # reaches the threshold.
  toxic_words = [word for word, weight in exp.as_list() if weight >= threshold]

  filtered = text
  if toxic_words:
    # Mask whole words only. A plain str.replace would also blank out the
    # same letters inside longer, unrelated words.
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in toxic_words) + r")\b"
    filtered = re.sub(pattern, lambda match: "*" * len(match.group()), text)

  print("=====================")
  print("Original Text: \n{}\n\nFiltered: \n{}".format(text, filtered))
  print("=====================")

In [None]:
# Try the filter on three sample comments: one short sentence and two
# multi-line texts.
filter_toxic("Please turn off your fucking cell phone please")

filter_toxic("""
You are wasting your time. The Fascists of Wakopedia will never allow anything bad to be awritten about a Liberal.
Jesus you can trash all you want Wakopedia loves when people do that but don't touch a liberal or you will be banned.
"""
)

filter_toxic("""
uan pablo montoya

Bold texthe drives fast cars and likes hot girls he is a pimp
""")