### Install and import necessary packages

In [None]:
!pip install -q pyicu
!pip install -q pycld2
!pip install -q polyglot
!pip install -q textstat
!pip install -q googletrans

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import gc
import re
import folium
import textstat
from scipy import stats
from colorama import Fore, Back, Style, init

import math
import numpy as np
import scipy as sp
import pandas as pd

import random
import networkx as nx
from pandas import Timestamp

from PIL import Image
from IPython.display import SVG
from keras.utils import model_to_dot

import requests
from IPython.display import HTML

import seaborn as sns
from tqdm import tqdm
import matplotlib.cm as cm
import matplotlib.pyplot as plt

tqdm.pandas()

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import transformers
import tensorflow as tf

from tensorflow.keras.callbacks import Callback
from sklearn.metrics import accuracy_score, roc_auc_score
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger

from tensorflow.keras.models import Model
from kaggle_datasets import KaggleDatasets
from tensorflow.keras.optimizers import Adam
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding
from tensorflow.keras.layers import LSTM, GRU, Conv1D, SpatialDropout1D

from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import activations
from tensorflow.keras import constraints
from tensorflow.keras import initializers
from tensorflow.keras import regularizers

import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.activations import *
from tensorflow.keras.constraints import *
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *

from sklearn import metrics
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer,\
                                            CountVectorizer,\
                                            HashingVectorizer

from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer  

import nltk
from textblob import TextBlob

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from googletrans import Translator
from nltk import WordNetLemmatizer
from polyglot.detect import Detector
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Stop-word set passed to the masked word clouds further down (wordcloud's built-in STOPWORDS).
stopword=set(STOPWORDS)

# Shared NLTK helpers: a WordNet lemmatizer and a tweet-style tokenizer.
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()

# Seed NumPy's global random generator.
np.random.seed(0)

### Load the training, validation, and testing datasets

In [None]:
# Directory holding the competition files; the trailing slash matters because
# file names are appended to it by plain string concatenation.
DATA_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"
# List the directory contents (shown as the cell output).
os.listdir(DATA_PATH)

In [None]:
# Full paths of the three CSV files inside DATA_PATH.
TRAIN_PATH = f"{DATA_PATH}jigsaw-toxic-comment-train.csv"
VAL_PATH = f"{DATA_PATH}validation.csv"
TEST_PATH = f"{DATA_PATH}test.csv"

# Read the training, validation and test splits into DataFrames.
train_data = pd.read_csv(TRAIN_PATH)
val_data = pd.read_csv(VAL_PATH)
test_data = pd.read_csv(TEST_PATH)

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the validation set.
val_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

### Wordcloud of all comments

In [None]:
def nonan(x):
    """Return ``x`` as single-line text, or ``""`` when ``x`` is not a string.

    Line breaks are replaced by a space rather than deleted, so the words on
    either side of a break stay separate tokens when the result is fed to a
    word cloud. Non-string values (e.g. NaN for a missing comment) become an
    empty string.
    """
    # isinstance also accepts str subclasses such as numpy string scalars.
    if isinstance(x, str):
        return x.replace("\n", " ")
    return ""

# Join every training comment into one string and render it as a word cloud.
text = " ".join(map(nonan, train_data["comment_text"]))
cloud_builder = WordCloud(
    max_font_size=None,
    background_color="black",
    collocations=False,
    width=1200,
    height=1000,
)
wordcloud = cloud_builder.generate(text)
fig = px.imshow(wordcloud)
fig.update_layout(title_text="Common words in comments")

## Languages <a id="1.2"></a>





In [None]:
def get_language(text):
    """Return the name of the top language polyglot detects in ``text``.

    Non-printable characters are removed before detection, and the detector
    is run with ``quiet=True``.
    """
    printable_text = "".join(ch for ch in text if ch.isprintable())
    detector = Detector(printable_text, quiet=True)
    return detector.languages[0].name

# Detect the language of every training comment (progress bar via tqdm's progress_apply).
train_data["lang"] = train_data["comment_text"].progress_apply(get_language)

### English vs. Non-English

In [None]:
# Comments per detected language, ordered alphabetically by language name.
# value_counts tallies the column in one pass instead of rescanning it once
# per language.
lang_counts = train_data["lang"].value_counts().sort_index()
lang_list = lang_counts.index.tolist()
counts = lang_counts.tolist()
df = pd.DataFrame({"Language": lang_list, "Count": counts})

# English vs. everything else. The frame is built from a dict so "Count" stays
# numeric (going through np.transpose on a mixed list turns it into strings),
# and the English bar uses the count of the "English" label itself rather than
# assuming it is the largest group.
english_count = int(lang_counts.get("English", 0))
df_en = pd.DataFrame({
    "Language": ["English", "Non-English"],
    "Count": [english_count, sum(counts) - english_count],
})

fig = px.bar(df_en, x="Language", y="Count", title="Language of comments", color="Language", text="Count")
fig.update_layout(template="plotly_white")
# Same outline and label styling for every bar trace.
fig.update_traces(
    marker_line_color="rgb(0, 0, 0)",
    marker_line_width=0.5,
    textfont_color="black",
    textposition="outside",
)
fig

### Bar chart of non-English languages

In [None]:
# Horizontal bar chart of the non-English, non-'un' languages with at least 50 comments.
frequent_langs = df.query("Language != 'English' and Language != 'un'").query("Count >= 50")
fig = px.bar(
    frequent_langs,
    x="Count",
    y="Language",
    orientation="h",
    color="Language",
    text="Count",
    template="plotly_white",
    title="Language of non-English comments",
)
fig.update_traces(marker={"line": {"width": 0.75, "color": "black"}}, textposition="outside")
fig.update_layout(showlegend=False)
fig

### Pie chart of non-English languages

In [None]:
# Pie chart of the non-English, non-'un' languages with at least 50 comments.
# The filtered frame is computed once and reused for labels and values.
non_english = df.query("Language != 'English' and Language != 'un'").query("Count >= 50")
fig = go.Figure([go.Pie(labels=non_english["Language"], values=non_english["Count"])])
fig.update_layout(title_text="Pie chart of non-English languages", template="plotly_white")
# marker.colors takes a flat list of colors; wrapping the palette in another
# list hands it a single list-valued entry instead of one color per slice.
fig.data[0].marker.colors = px.colors.qualitative.Plotly[2:]
fig.data[0].textfont.color = "black"
fig.data[0].textposition = "outside"
fig.show()

In [None]:
# Language name -> country name used to place that language on the maps.
# NOTE(review): Manx is native to the Isle of Man; the "Ireland" mapping is
# kept unchanged so existing plots do not change.
_COUNTRY_BY_LANGUAGE = {
    "German": "Germany",
    "Scots": "Scotland",
    "Danish": "Denmark",
    "Arabic": "Saudi Arabia",
    "Spanish": "Spain",
    "Persian": "Iran",
    "Greek": "Greece",
    "Portuguese": "Portugal",
    "English": "United Kingdom",
    "Hindi": "India",
    "Albanian": "Albania",
    "Bosnian": "Bosnia and Herzegovina",
    "Croatian": "Croatia",
    "Dutch": "Netherlands",
    "Russian": "Russia",
    "Vietnamese": "Vietnam",
    "Somali": "Somalia",
    "Turkish": "Turkey",
    "Serbian": "Serbia",
    "Indonesian": "Indonesia",
    "Manx": "Ireland",
    "Latin": "Holy See (Vatican City State)",
    "Afrikaans": "South Africa",
}


def get_country(language):
    """Return the country name associated with ``language``.

    Languages without an entry in the table yield the string ``"None"`` (not
    the ``None`` object); callers filter on that with ``country != 'None'``.
    """
    return _COUNTRY_BY_LANGUAGE.get(language, "None")
    
# Attach a country to every language row; languages without a mapping get the string "None".
df["country"] = df["Language"].progress_apply(get_country)

### World plot of non-English languages

In [None]:
# World map coloured by comment count, for mapped non-English languages with at least 5 comments.
mapped_langs = df.query("Language != 'English' and Language != 'un' and country != 'None'").query("Count >= 5")
fig = px.choropleth(
    mapped_langs,
    locations="country",
    locationmode="country names",
    hover_name="country",
    color="Count",
    color_continuous_scale="agsunset",
    projection="natural earth",
    template="plotly",
    title="Countries of non-English languages",
)
fig.show()

### Non-English European countries

In [None]:
# Same comment-count map restricted to the Europe scope (no minimum count here).
mapped_langs = df.query("Language != 'English' and Language != 'un' and country != 'None'")
fig = px.choropleth(
    mapped_langs,
    locations="country",
    locationmode="country names",
    hover_name="country",
    color="Count",
    color_continuous_scale="aggrnyl",
    projection="natural earth",
    scope="europe",
    template="plotly",
    title="Non-English European countries",
)
fig.show()

In [None]:
# Same comment-count map restricted to the Asia scope (no minimum count here).
mapped_langs = df.query("Language != 'English' and Language != 'un' and country != 'None'")
fig = px.choropleth(
    mapped_langs,
    locations="country",
    locationmode="country names",
    hover_name="country",
    color="Count",
    color_continuous_scale="spectral",
    projection="natural earth",
    scope="asia",
    template="plotly",
    title="Asian countries",
)
fig.show()

In [None]:
# Same comment-count map restricted to the Africa scope, for languages with at least 5 comments.
mapped_langs = df.query("Language != 'English' and Language != 'un' and country != 'None'").query("Count >= 5")
fig = px.choropleth(
    mapped_langs,
    locations="country",
    locationmode="country names",
    hover_name="country",
    color="Count",
    color_continuous_scale="agsunset",
    projection="natural earth",
    scope="africa",
    template="plotly",
    title="African countries",
)
fig.show()

### Distribution of comment words

In [None]:
def new_len(x):
    """Return the number of whitespace-separated words in ``x``.

    Non-string values (e.g. NaN for a missing comment) count as 0 words.
    """
    # isinstance also accepts str subclasses such as numpy string scalars,
    # which an exact type comparison would count as 0 words.
    if isinstance(x, str):
        return len(x.split())
    return 0

# Word count per comment, then the distribution of those counts for a 10%
# sample of the non-empty comments shorter than 200 words.
train_data["comment_words"] = train_data["comment_text"].apply(new_len)
short_comments = train_data.query("comment_words != 0 and comment_words < 200")
nums = short_comments.sample(frac=0.1)["comment_words"]
fig = ff.create_distplot(hist_data=[nums], group_labels=["All comments"], colors=["coral"])
fig.update_layout(
    title_text="Comment words",
    xaxis_title="Comment words",
    template="simple_white",
    showlegend=False,
)
fig.show()

### Average comment words vs. Language

In [None]:
# Average number of words per comment for each detected language.
# The column is selected before aggregating: averaging the whole frame trips
# over its text columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["comment_words"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="Average_comment_words")
)
# Leave out languages whose average is 500 words or more.
df = df.query("Average_comment_words < 500")
fig = go.Figure(go.Bar(x=df["Language"], y=df["Average_comment_words"]))

fig.update_layout(xaxis_title="Language", yaxis_title="Average comment words", title_text="Average comment words vs. language", template="plotly_white")
fig.show()

### Average comment length vs. Country

In [None]:
# Map each language of the current df to a country and plot the average
# comment length on a world map; unmapped languages are dropped.
df["country"] = df["Language"].apply(get_country)
df = df[df["country"] != "None"]

fig = px.choropleth(
    df,
    locations="country",
    locationmode="country names",
    hover_name="country",
    color="Average_comment_words",
    color_continuous_scale="aggrnyl",
    projection="natural earth",
    template="plotly",
    title="Average comment length vs. Country",
)
fig

## Sentiment and polarity <a id="1.4"></a>



In [None]:
def polarity(x):
    """Return the VADER polarity scores of ``x`` as a dict.

    For a string this is ``SIA.polarity_scores(x)``. Any other value (e.g. NaN
    for a missing comment) gets a fully neutral score dict with the keys
    "neg", "neu", "pos" and "compound", so that code indexing the result by
    those keys keeps working; a bare number here would raise TypeError there.
    """
    if isinstance(x, str):
        return SIA.polarity_scores(x)
    # A fresh dict on every call, so no two rows share one mutable object.
    return {"neg": 0.0, "neu": 1.0, "pos": 0.0, "compound": 0.0}
    
# Analyzer instance read by polarity() through the module-level name SIA.
SIA = SentimentIntensityAnalyzer()
# Score every training comment (progress bar via tqdm's progress_apply).
train_data["polarity"] = train_data["comment_text"].progress_apply(polarity)

In [None]:
# Histogram of the "neg" scores, leaving out comments that scored exactly 0.
negative_scores = [scores["neg"] for scores in train_data["polarity"] if scores["neg"] != 0]
fig = go.Figure(go.Histogram(x=negative_scores, marker={"color": "seagreen"}))
fig.update_layout(
    xaxis_title="Negativity sentiment",
    title_text="Negativity sentiment",
    template="simple_white",
)
fig.show()

### Negativity vs. Country

In [None]:
# Mean "neg" score per detected language, shown on a world map.
train_data["negativity"] = train_data["polarity"].apply(lambda x: x["neg"])
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["negativity"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="Negativity")
)
# Drop languages whose average is exactly 0; copy so the column assignment
# below writes to an independent frame.
df = df.query("Negativity != 0").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average negative sentiment vs. Country", color="Negativity",
                    template="plotly", color_continuous_scale="greens")
fig.show()

### Negativity vs. Toxicity

In [None]:
# Negativity distributions of toxic vs. non-toxic comments; each class is
# filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["negativity"]
nums_2 = clean_sample["negativity"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Negativity vs. Toxicity",
    xaxis_title="Negativity",
    template="simple_white",
)
fig.show()

### Positive sentiment


In [None]:
# Histogram of the "pos" scores, leaving out comments that scored exactly 0.
positive_scores = [scores["pos"] for scores in train_data["polarity"] if scores["pos"] != 0]
fig = go.Figure(go.Histogram(x=positive_scores, marker={"color": "indianred"}))
fig.update_layout(
    xaxis_title="Positivity sentiment",
    title_text="Positivity sentiment",
    template="simple_white",
)
fig.show()

### Positivity vs. Country

In [None]:
# Mean "pos" score per detected language, shown on a world map.
train_data["positivity"] = train_data["polarity"].apply(lambda x: x["pos"])
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["positivity"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="Positivity")
)
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average positive sentiment vs. Country", color="Positivity",
                    template="plotly", color_continuous_scale="reds")
fig.show()

### Positivity vs. Toxicity

In [None]:
# Positivity distributions of toxic vs. non-toxic comments; each class is
# filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["positivity"]
nums_2 = clean_sample["positivity"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Positivity vs. Toxicity",
    xaxis_title="Positivity",
    template="simple_white",
)
fig.show()

### Neutrality sentiment



In [None]:
# Histogram of the "neu" scores, leaving out comments that scored exactly 1.
neutral_scores = [scores["neu"] for scores in train_data["polarity"] if scores["neu"] != 1]
fig = go.Figure(go.Histogram(x=neutral_scores, marker={"color": "dodgerblue"}))
fig.update_layout(
    xaxis_title="Neutrality sentiment",
    title_text="Neutrality sentiment",
    template="simple_white",
)
fig.show()

### Neutrality vs. Country

In [None]:
# Mean "neu" score per detected language, shown on a world map.
train_data["neutrality"] = train_data["polarity"].apply(lambda x: x["neu"])
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["neutrality"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="Neutrality")
)
# Drop languages whose average is exactly 1; copy so the column assignment
# below writes to an independent frame.
df = df.query("Neutrality != 1").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average neutral sentiment vs. Country", color="Neutrality",
                    template="plotly", color_continuous_scale="blues")
fig.show()

### Neutrality vs. Toxicity

In [None]:
# Neutrality distributions of toxic vs. non-toxic comments; each class is
# filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["neutrality"]
nums_2 = clean_sample["neutrality"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Neutrality vs. Toxicity",
    xaxis_title="Neutrality",
    template="simple_white",
)
fig.show()

### Compound sentiment


In [None]:
# Histogram of the "compound" scores, leaving out comments that scored exactly 0.
compound_scores = [scores["compound"] for scores in train_data["polarity"] if scores["compound"] != 0]
fig = go.Figure(go.Histogram(x=compound_scores, marker={"color": "orchid"}))
fig.update_layout(
    xaxis_title="Compound sentiment",
    title_text="Compound sentiment",
    template="simple_white",
)
fig.show()

### Average compound sentiment vs. Country

In [None]:
# Mean "compound" score per detected language, shown on a world map.
train_data["compound"] = train_data["polarity"].apply(lambda x: x["compound"])
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["compound"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="Compound")
)
# Drop languages whose average is exactly 0; copy so the column assignment
# below writes to an independent frame.
df = df.query("Compound != 0").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average compound sentiment vs. Country", color="Compound",
                    template="plotly", color_continuous_scale="purples")
fig.show()

### Compound sentiment vs. Toxicity

In [None]:
# Compound-score distributions of toxic vs. non-toxic comments; each class is
# filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["compound"]
nums_2 = clean_sample["compound"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Compound vs. Toxicity",
    xaxis_title="Compound",
    template="simple_white",
)
fig.show()

## Readability <a id="1.5"></a>


In [None]:
# Compute three textstat readability scores for every comment, one new column
# per score, in the order listed here.
readability_metrics = {
    "flesch_reading_ease": textstat.flesch_reading_ease,
    "automated_readability": textstat.automated_readability_index,
    "dale_chall_readability": textstat.dale_chall_readability_score,
}
for column_name, metric in readability_metrics.items():
    train_data[column_name] = train_data["comment_text"].progress_apply(metric)

### Distribution of Flesch reading ease

In [None]:
# Histogram of the Flesch reading ease scores above 0.
flesch_scores = train_data.query("flesch_reading_ease > 0")["flesch_reading_ease"]
fig = go.Figure(go.Histogram(x=flesch_scores, marker={"color": "darkorange"}))
fig.update_layout(
    xaxis_title="Flesch reading ease",
    title_text="Flesch reading ease",
    template="simple_white",
)
fig.show()

### Flesch reading ease vs. Country

In [None]:
# Mean Flesch reading ease per detected language, shown on a world map.
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["flesch_reading_ease"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="flesch_reading_ease")
)
# Keep only languages with a positive average; copy so the column assignment
# below writes to an independent frame.
df = df.query("flesch_reading_ease > 0").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average Flesch reading ease vs. Country", color="flesch_reading_ease",
                    template="plotly", color_continuous_scale="oranges")
fig.show()

### Flesch reading ease vs. Toxicity

In [None]:
# Flesch reading ease distributions of toxic vs. non-toxic comments; each
# class is filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["flesch_reading_ease"]
nums_2 = clean_sample["flesch_reading_ease"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Flesch reading ease vs. Toxicity",
    xaxis_title="Flesch reading ease",
    template="simple_white",
)
fig.show()

### Distribution of automated readability

In [None]:
# Histogram of the automated readability scores below 100.
ari_scores = train_data.query("automated_readability < 100")["automated_readability"]
fig = go.Figure(go.Histogram(x=ari_scores, marker={"color": "mediumaquamarine"}))
fig.update_layout(
    xaxis_title="Automated readability",
    title_text="Automated readability",
    template="simple_white",
)
fig.show()

### Automated readability vs. Country

In [None]:
# Mean automated readability score per detected language, shown on a world map.
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["automated_readability"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="automated_readability")
)
# Keep only languages averaging below 100; copy so the column assignment
# below writes to an independent frame.
df = df.query("automated_readability < 100").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Automated readability vs. Country", color="automated_readability",
                    template="plotly", color_continuous_scale="GnBu")
fig.show()

### Automated readability vs. Toxicity

In [None]:
# Automated readability distributions of toxic vs. non-toxic comments; each
# class is filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["automated_readability"]
nums_2 = clean_sample["automated_readability"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Automated readability vs. Toxicity",
    xaxis_title="Automated readability",
    template="simple_white",
)
fig.show()

### Distribution of Dale-Chall readability

In [None]:
# Histogram of the Dale-Chall readability scores below 20.
dale_chall_scores = train_data.query("dale_chall_readability < 20")["dale_chall_readability"]
fig = go.Figure(go.Histogram(x=dale_chall_scores, marker={"color": "deeppink"}))
fig.update_layout(
    xaxis_title="Dale-Chall readability",
    title_text="Dale-Chall readability",
    template="simple_white",
)
fig.show()

### Dale-Chall readability vs. Country

In [None]:
# Mean Dale-Chall readability score per detected language, shown on a world map.
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["dale_chall_readability"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="dale_chall_readability")
)
# Keep only languages averaging below 20; copy so the column assignment
# below writes to an independent frame.
df = df.query("dale_chall_readability < 20").copy()
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Dale-Chall readability vs. Country", color="dale_chall_readability",
                    template="plotly", color_continuous_scale="PuRd")
fig.show()

### Dale-Chall readability vs. Toxicity

In [None]:
# Dale-Chall readability distributions of toxic vs. non-toxic comments; each
# class is filtered from its own 10% random sample of the training data.
toxic_sample = train_data.sample(frac=0.1).query("toxic == 1")
clean_sample = train_data.sample(frac=0.1).query("toxic == 0")
nums_1 = toxic_sample["dale_chall_readability"]
nums_2 = clean_sample["dale_chall_readability"]

fig = ff.create_distplot(
    hist_data=[nums_1, nums_2],
    group_labels=["Toxic", "Non-toxic"],
    colors=["darkorange", "dodgerblue"],
    show_hist=False,
)
fig.update_layout(
    title_text="Dale-Chall readability vs. Toxicity",
    xaxis_title="Dale-Chall readability",
    template="simple_white",
)
fig.show()

## Targets 

### Wordclouds for different categories

### Non-toxic vs. Toxic

In [None]:
# --- Word cloud of non-toxic comments, drawn inside the "safe-zone" image mask ---
clean_mask=np.array(Image.open("../input/imagesforkernal/safe-zone.png"))
# Keep only channel index 1 of the image array so the mask is 2-D.
clean_mask=clean_mask[:,:,1]

subset = train_data.query("toxic == 0")
text = subset.comment_text.values
wc = WordCloud(background_color="black",max_words=2000,mask=clean_mask,stopwords=stopword)
# NOTE(review): " ".join assumes every comment here is a string — confirm there are no missing values.
wc.generate(" ".join(text))
plt.figure(figsize=(7.5, 7.5))
plt.axis("off")
plt.title("Words frequented in Clean Comments", fontsize=16)
plt.imshow(wc.recolor(colormap= 'viridis' , random_state=17), alpha=0.98)
plt.show()

# --- Word cloud of toxic comments, drawn inside the "swords" image mask ---
# The name clean_mask is reused here even though it now holds the toxic mask.
clean_mask=np.array(Image.open("../input/imagesforkernal/swords.png"))
clean_mask=clean_mask[:,:,1]

subset = train_data.query("toxic == 1")
text = subset.comment_text.values
wc = WordCloud(background_color="black",max_words=2000,mask=clean_mask,stopwords=stopword)
wc.generate(" ".join(text))
plt.figure(figsize=(7.5, 7.5))
plt.axis("off")
plt.title("Words frequented in Toxic Comments", fontsize=16)
plt.imshow(wc.recolor(colormap= 'viridis' , random_state=17), alpha=0.98)
plt.show()

### Obscene vs. Severe Toxic vs. Threat vs. Insult

In [None]:
# 2x2 grid of masked word clouds, one per label: obscene, severe toxic, threat, insult.
# Each panel loads a mask image, keeps channel index 1 so the mask is 2-D,
# builds the cloud from the comments carrying that label, and draws it in its subplot.
toxic_mask=np.array(Image.open("../input/imagesforkernal/toxic-sign.png"))
toxic_mask=toxic_mask[:,:,1]
# Word cloud for obscene comments (top-left panel)
subset=train_data.query("obscene == 1")
text=subset.comment_text.values
wc= WordCloud(background_color="black",max_words=4000,mask=toxic_mask,stopwords=stopword)
wc.generate(" ".join(text))
# The figure is created here, after the first cloud is generated and before any subplot is selected.
plt.figure(figsize=(20,20))
plt.subplot(221)
plt.axis("off")
plt.title("Words frequented in Obscene Comments", fontsize=20)
plt.imshow(wc.recolor(colormap= 'gist_earth' , random_state=244), alpha=0.98)

# Severely toxic comments (top-right panel)
plt.subplot(222)
severe_toxic_mask=np.array(Image.open("../input/imagesforkernal/bomb.png"))
severe_toxic_mask=severe_toxic_mask[:,:,1]
subset=train_data[train_data.severe_toxic==1]
text=subset.comment_text.values
wc= WordCloud(background_color="black",max_words=2000,mask=severe_toxic_mask,stopwords=stopword)
wc.generate(" ".join(text))
plt.axis("off")
plt.title("Words frequented in Severe Toxic Comments", fontsize=20)
plt.imshow(wc.recolor(colormap= 'Reds' , random_state=244), alpha=0.98)

# Threat comments (bottom-left panel)
plt.subplot(223)
threat_mask=np.array(Image.open("../input/imagesforkernal/anger.png"))
threat_mask=threat_mask[:,:,1]
subset=train_data[train_data.threat==1]
text=subset.comment_text.values
wc= WordCloud(background_color="black",max_words=2000,mask=threat_mask,stopwords=stopword)
wc.generate(" ".join(text))
plt.axis("off")
plt.title("Words frequented in Threatening Comments", fontsize=20)
plt.imshow(wc.recolor(colormap= 'summer' , random_state=2534), alpha=0.98)

# Insult comments (bottom-right panel)
plt.subplot(224)
insult_mask=np.array(Image.open("../input/imagesforkernal/swords.png"))
insult_mask=insult_mask[:,:,1]
subset=train_data[train_data.insult==1]
text=subset.comment_text.values
wc= WordCloud(background_color="black",max_words=2000,mask=insult_mask,stopwords=stopword)
wc.generate(" ".join(text))
plt.axis("off")
plt.title("Words frequented in insult Comments", fontsize=20)
plt.imshow(wc.recolor(colormap= 'Paired_r' , random_state=244), alpha=0.98)

# Render all four panels together.
plt.show()

### Pie chart of targets

In [None]:
# Pie chart of the column sums for column positions 2-6 of the training frame.
# NOTE(review): presumably these positions hold the label columns — confirm no label column is left out.
label_slice = slice(2, 7)
label_totals = train_data.iloc[:, label_slice].sum().values
pie_trace = go.Pie(
    labels=train_data.columns[label_slice],
    values=label_totals,
    marker={"colors": px.colors.qualitative.Plotly},
)
fig = go.Figure(data=[pie_trace])
fig.update_traces(textposition="outside", textfont={"color": "black"})
fig.update_layout(title_text="Pie chart of labels")
fig.show()

### Bar chart of targets

In [None]:
# Horizontal bar chart of the column sums for column positions 2-6 of the training frame.
# NOTE(review): presumably these positions hold the label columns — confirm no label column is left out.
label_slice = slice(2, 7)
bar_trace = go.Bar(
    y=train_data.columns[label_slice],
    x=train_data.iloc[:, label_slice].sum().values,
    marker={"color": px.colors.qualitative.Plotly},
)
fig = go.Figure(data=[bar_trace])
fig.update_traces(
    orientation="h",
    marker_line_color="rgb(0, 0, 0)",
    marker_line_width=0.75,
)
fig.update_layout(title_text="Bar chart of labels", template="plotly_white")
fig.show()

### Toxicity vs. Country

In [None]:
# Mean of the "toxic" column per detected language, shown on a world map.
# The column is selected before aggregating: averaging the whole frame trips
# over its text/dict columns on pandas >= 2.0, and building the frame from the
# grouped result keeps each mean attached to its own language label.
df = (
    train_data.groupby("lang")["toxic"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="toxicity")
)
df["country"] = df["Language"].apply(get_country)
df = df.query("country != 'None'")

fig = px.choropleth(df, locations="country", hover_name="country",
                    projection="natural earth", locationmode="country names", title="Average toxicity vs. Country", color="toxicity",
                    template="plotly", color_continuous_scale="tealrose")
fig.show()