In [1]:
pip install texthero==1.0.5

In [2]:
pip install spacy 1.3.1


In [3]:
pip install transformers==2.11.0

In [4]:
pip install spacy-transformers[cuda100]==0.6.2

In [5]:
pip install -U textblob

In [6]:
pip install transformers

In [7]:
pip install spacy 1.3.1

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


import re
import string
from wordcloud import WordCloud

from textblob import TextBlob

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import word2vec

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


import joblib

In [2]:
# Load the raw tweet dataset and preview its first 20 rows.
dataset_path = "../input/cyberbullying-dataset/twitter_parsed_dataset.csv"
df = pd.read_csv(dataset_path)
df.head(n=20)

# Exploratory Data Analysis (Sentence-Level Analysis)

In [3]:
df.info()

In [4]:
df.shape

In [5]:
df.isnull().sum() 

In [6]:
# Remove every row that contains at least one missing value (mutates `df` in
# place), then display the per-column null counts again as a check.
df.dropna(inplace=True)
df.isnull().sum()

In [7]:
# Convert the label column to integer dtype.
df['oh_label']=df['oh_label'].astype(int)

In [8]:
# Drop the columns that are not used in the rest of the notebook.
# errors='ignore' keeps the cell re-runnable: without it a second run raised
# KeyError because the columns had already been removed in place.
df.drop(columns=['index', 'id', 'Annotation'], inplace=True, errors='ignore')

In [9]:
df.head()

# ### First, I’ll take a look at the number of characters present in each sentence. This can give us a rough idea about the text length.

In [10]:
df['Text'].str.len().hist()

### To check the sentence length distribution.

In [11]:
# Store the character length of each tweet, report summary statistics and
# plot the length distribution.
df['len'] = df['Text'].str.len()
lengths = df['len']
print('Max length: {}, Min length: {}, Average Length :  {}'.format(lengths.max(), lengths.min(), lengths.mean()))
lengths.hist()

#### Word level Analysis

In [12]:
# Concatenate all tweets into one string and split it on whitespace.
# The tweets are joined with a space: joining with ',' (as before) glued the
# last word of one tweet to the first word of the next ("end,start"), which
# then survived `split()` as a single bogus token.
text = ' '.join(str(tweet) for tweet in df['Text'])
words_list = text.split()

In [13]:
from collections import Counter

# Count every distinct token in a single pass. The previous version called
# `words_list.count(word)` for each unique word, rescanning the whole list
# every time (quadratic in the corpus size).
word_freq = dict(Counter(words_list))
#Creating dataframe of words
df_word = pd.DataFrame(word_freq.items(), columns=['word', 'count'])

In [14]:
# Length of each distinct word.
df_word['word_len'] = df_word['word'].map(len)
# Most frequent words first, with a fresh 0..n-1 index.
df_word = (
    df_word
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
df_word

In [15]:
# Horizontal bar chart of the 50 most frequent tokens.
# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally, and the keyword form matches the other barplot call in this
# notebook.
df_top = df_word.head(50)
sns.barplot(x=df_top['count'], y=df_top['word'])

In [16]:
df_word['word_len'].hist()

In [17]:
df['Text'].sample(1).values[0]

In [18]:
# Class balance of the target column.
# value_counts() orders rows by frequency, so hard-coding tick labels by bar
# position could attach a name to the wrong class. Sort by label value and
# derive each tick label from the label it actually represents.
# NOTE(review): assumes oh_label == 1 marks a toxic tweet and 0 a non-toxic
# one — confirm against the dataset description.
label_names = {0: 'non-toxic', 1: 'toxic'}
label_counts = df['oh_label'].value_counts().sort_index()
label_counts.plot(kind='bar', color=sns.color_palette('pastel'))
plt.xticks(range(len(label_counts)),
           [label_names.get(value, str(value)) for value in label_counts.index],
           rotation=0);

#### Analyzing the amount and the types of stopwords can give us some good insights into the data.

#### We will use the `Counter` class from the `collections` library to count the occurrences of each word; its `most_common()` method returns them as a list of (word, count) tuples. This is very useful when we deal with word-level analysis in natural language processing.

### Data Cleaning

In [19]:
# Print the first 100 raw tweets, numbered from 1.
for review_no, tweet in enumerate(df['Text'][:100], start=1):
  print('Review %d:\n' % review_no, tweet)


#### Here, you can see that we have some contractions like “It’s”, numbers like “3” and punctuation marks like “,”, “!” and “.” present in the tweets. We’ll handle these by performing the operations below:

1) Expand contractions
2) Lowercase the reviews
3) Remove digits and words containing digits
4) Remove punctuations

In [20]:
# Start the cleaned column as a lower-cased copy of the raw text.
df['cleaned'] = df['Text'].map(lambda tweet: tweet.lower())

In [21]:
# Remove digits and any word that contains a digit.
# The pattern is a raw string: in a normal string literal `\w` and `\d` are
# invalid escape sequences, which Python reports with a warning and intends
# to reject in a future version.
df['cleaned'] = df['cleaned'].apply(lambda tweet: re.sub(r'\w*\d\w*', '', tweet))

In [22]:

# Strip every character listed in string.punctuation.
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
df['cleaned'] = df['cleaned'].map(lambda tweet: punctuation_pattern.sub('', tweet))

In [23]:

# Collapse every run of spaces into a single space.
multi_space_pattern = re.compile(' +')
df['cleaned'] = df['cleaned'].map(lambda tweet: multi_space_pattern.sub(' ', tweet))

In [24]:
                                                      
# Strip emoji, pictographs and related symbol code points from the cleaned text.
# The pattern is built once here instead of inside the per-row lambda, the
# duplicated U+2702-U+27B0 range was dropped (no change to what is matched),
# and the range comments now describe the code points they actually cover.
# NOTE(review): the U+24C2-U+1F251 range is very wide — it also spans CJK,
# Hangul and most other scripts above U+24C2, so such text is removed too.
# Confirm this is intended.
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                           u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U000024C2-\U0001F251"  # wide catch-all range (see note above)
                           u"\U0001f926-\U0001f937"  # supplemental people / gesture emoji
                           u"\U00010000-\U0010ffff"  # every supplementary-plane code point
                           u"\u2640-\u2642"          # gender signs
                           u"\u2600-\u2B55"          # misc symbols .. heavy large circle
                           u"\u200d"                 # zero-width joiner
                           u"\u23cf"                 # eject symbol
                           u"\u23e9"                 # fast-forward symbol
                           u"\u231a"                 # watch
                           u"\ufe0f"                 # variation selector-16
                           u"\u3030"                 # wavy dash
                           "]+", flags=re.UNICODE)

df['cleaned'] = df['cleaned'].apply(lambda tweet: emoji_pattern.sub(r'', tweet))

In [25]:
# Copy the cleaned text into `corrected_text`.
# The previous version assigned the string literal 'cleaned', which filled
# every row with the word "cleaned" instead of the column's contents.
df['corrected_text'] = df['cleaned']
# Preview a few rows rather than rendering the entire frame.
df.head()

In [26]:
# Print the first 100 cleaned tweets, numbered from 1.
for review_no, cleaned_tweet in enumerate(df['cleaned'][:100], start=1):
  print('Review %d:\n' % review_no, cleaned_tweet)

#### We’ll use spaCy for the removal of stopwords and lemmatization. It is a library for advanced Natural Language Processing in Python.

In [27]:
# spaCy provides the tokenizer, lemmatizer and stop-word flags used below.
import spacy

# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# For every cleaned tweet keep the lemma of each token that is not a stop
# word and join the lemmas back into one space-separated string.
df['lemmatized'] = df['cleaned'].apply(
    lambda tweet: ' '.join(tok.lemma_ for tok in nlp(tweet) if not tok.is_stop)
)


In [28]:
# One row per distinct raw tweet; the lemmatized strings of identical tweets
# are concatenated with a space.
df_grouped = (
    df[['Text', 'lemmatized']]
    .groupby('Text')
    .agg(lambda lemmas: ' '.join(lemmas))
)
df_grouped.head()

In [29]:
# Creating Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word')
data = cv.fit_transform(df_grouped['lemmatized'])

# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; fall back to the old name only when
# the installed version predates the new method.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dense term-count table: one row per grouped tweet, one column per term.
df_dtm = pd.DataFrame(data.toarray(), columns=feature_names)
df_dtm.index = df_grouped.index
df_dtm.head(3)

#### So, let’s start by looking at the common words present in the tweets. For this, I will use the document term matrix created earlier together with word clouds for plotting these words. Word clouds are visual representations of the frequency of different words present in a document.

In [30]:
import nltk
# Download the NLTK stop-word corpus, then build a set of the English stop
# words so that membership checks in the following cells are fast.
nltk.download('stopwords')
stop=set(stopwords.words('english'))

In [31]:
from collections import defaultdict

# Flatten the whitespace-tokenised raw tweets into one list of tokens.
tokenized_tweets = df['Text'].str.split().values.tolist()
corpus = [token for tweet_tokens in tokenized_tweets for token in tweet_tokens]

# Count how often each stop word occurs (the match is case-sensitive).
dic = defaultdict(int)
for token in (tok for tok in corpus if tok in stop):
    dic[token] += 1

In [32]:

            
    top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10] 
    x,y=zip(*top)
    plt.bar(x,y)

In [33]:
### Plot top non stopwords
from collections import Counter

counter = Counter(corpus)
most = counter.most_common()

# Only the 40 most frequent tokens are inspected; stop words among them are
# dropped before plotting.
non_stop_pairs = [(token, freq) for token, freq in most[:40] if token not in stop]
x = [token for token, _ in non_stop_pairs]
y = [freq for _, freq in non_stop_pairs]

sns.barplot(x=y, y=x)


# ##Wordcloud
A word cloud is a great way to represent text data. The size and color of each word that appears in the word cloud indicate its frequency or importance.

In [34]:
from wordcloud import WordCloud, STOPWORDS

# Kept under its own name: binding this set to `stopwords` would shadow the
# `nltk.corpus.stopwords` object imported at the top of the notebook and make
# the cell that calls `stopwords.words('english')` fail when it is re-run.
wordcloud_stopwords = set(STOPWORDS)

def show_wordcloud(data):
    """Render a word cloud for ``data``.

    Parameters
    ----------
    data : str or iterable
        Either the full text, or an iterable of tokens. Tokens are joined
        with single spaces. Passing the list through ``str()`` (as before)
        fed its repr -- quotes and commas included -- to WordCloud, so tokens
        picked up a trailing apostrophe and slipped past the stop-word filter.
        Any other object falls back to ``str(data)``.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(token) for token in data)
        except TypeError:
            # Not iterable: keep the old behaviour of using its string form.
            text = str(data)

    cloud = WordCloud(
        background_color='white',
        stopwords=wordcloud_stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1).generate(text)

    plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(cloud)
    plt.show()

show_wordcloud(corpus)

#### Textblob
Textblob is a python library built on top of nltk. It has been around for some time and is very easy and convenient to use.

The sentiment function of TextBlob returns two properties:

polarity: is a floating-point number that lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement.
subjectivity: refers to how someone’s judgment is shaped by personal opinions and feelings. Subjectivity is represented as a floating-point value which lies in the range of [0,1].

In [35]:
from textblob import TextBlob

In [36]:
# Sentiment of one sample tweet. Positional `.iloc` is used because rows were
# dropped earlier without resetting the index, so the *label* 8 is not
# guaranteed to exist and `df['Text'][8]` could raise KeyError.
TextBlob(df['Text'].iloc[8]).sentiment

In [37]:
def polarity(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every raw tweet and plot the distribution of the scores.
df['polarity_score'] = df['Text'].apply(polarity)
df['polarity_score'].hist()

In [38]:
def sentiment(x):
    """Map a polarity score to a label.

    Negative scores give 'toxic', a score equal to zero gives 'neutral',
    and anything else gives 'non-toxic'.
    """
    if x < 0:
        return 'toxic'
    return 'neutral' if x == 0 else 'non-toxic'
    
# Label every tweet from its polarity score and plot how many tweets fall
# into each label.
df['polarity'] = df['polarity_score'].map(sentiment)

polarity_counts = df.polarity.value_counts()
plt.bar(polarity_counts.index, polarity_counts)

#### Let’s take a look at some of the toxic and non-toxic tweets

In [39]:
df[df['polarity']=='toxic']['cleaned'].head()

In [40]:
df[df['polarity']=='non-toxic']['cleaned'].head()

In [41]:
df

#### Feature Extraction
Some features will be extracted after text cleaning because they are more meaningful to obtain at this step

In [42]:
# Number of whitespace-separated words in each cleaned tweet.
# `split()` with no argument ignores leading/trailing/repeated whitespace;
# the previous `split(" ")` counted the resulting empty strings as words
# (an empty tweet counted as 1) and disagreed with how `avg_word` tokenises.
df['word_count'] = df['cleaned'].apply(lambda tweet: len(str(tweet).split()))
df[['cleaned', 'word_count']].head()

In [43]:
# Number of characters in each cleaned tweet.
df['char_count'] = df['cleaned'].str.len() ## this also includes spaces
df[['cleaned','char_count']].head()

In [44]:
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Returns 0.0 when the sentence contains no words. The previous version
    avoided a division by zero by adding 0.000001 to the word count, which
    made every non-empty result slightly too small.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)
# Average word length per cleaned tweet, rounded to one decimal place.
df['avg_word'] = df['cleaned'].map(avg_word).round(1)
df.loc[:, ['cleaned', 'avg_word']].head()

In [45]:
df.sample(5)

In [None]:
#### Modelling

In [46]:
# Keep only the columns needed for modelling by discarding the raw text and
# the exploratory feature columns.
columns_to_discard = [
    'Text', 'corrected_text', 'len', 'lemmatized',
    'polarity_score', 'polarity', 'word_count', 'char_count', 'avg_word',
]
final_df = df.drop(columns_to_discard, axis=1)

In [47]:
final_df

In [48]:
X = final_df['cleaned']
y = final_df['oh_label']

In [49]:
X

In [50]:
y

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [52]:
# Bag-of-words features followed by a 67/33 train/test split.
# The vectorizer is fitted on the text column itself rather than on `X`:
# `X = cv.fit_transform(X)` replaced the text with a sparse matrix, so
# re-running the cell fed that matrix back into the vectorizer and failed.
# NOTE(review): the vocabulary is learned from all rows, including those that
# end up in the test split — consider fitting on the training rows only.
cv = CountVectorizer()
X = cv.fit_transform(final_df['cleaned']) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [53]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
# The test accuracy used to be computed and silently discarded, because only
# the last expression of a cell is displayed; print it explicitly.
print("Test accuracy:", clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [54]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy,
# in that order, for the current predictions.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

# ### Save the Model For Deployment

In [55]:
model = MultinomialNB()


In [56]:
# Split the vectorized features and labels again with the same test size and
# random seed as the earlier split, then print the resulting shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [57]:
# fitting model
model.fit(X_train, y_train)

In [58]:
# predicting
y_pred = model.predict(X_test)

In [59]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy,
# in that order, for the predictions of `model`.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

In [60]:
score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)

In [61]:
import pickle

# Serialize the fitted classifier to model.pkl. The `with` block closes the
# file even if pickling raises, which the manual open()/close() pair did not.
# NOTE(review): only the classifier is saved; predicting on new raw text also
# needs the fitted CountVectorizer (`cv`) — consider persisting it as well.
with open("model.pkl", mode="wb") as pickle_out:
    pickle.dump(model, pickle_out)

# Explainable AI

In [62]:
import shap

In [65]:
import transformers

In [67]:
# First 20 cleaned tweets, each truncated to at most 500 characters.
first_tweets = df["cleaned"].iloc[:20]
short_data = [tweet[:500] for tweet in first_tweets]

In [68]:
# Pretrained sentiment pipeline that returns a score for every label.
# The model is named explicitly so the result does not depend on whatever
# default the installed transformers version picks for 'sentiment-analysis'.
# NOTE(review): this is a generic pretrained sentiment model, not the Naive
# Bayes classifier trained above — the SHAP plots below explain this
# pipeline, not `model`.
classifier = transformers.pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,
)
classifier(short_data[:2])

In [69]:
explainer = shap.Explainer(classifier)

In [70]:
# explain the predictions of the pipeline on the first two samples
shap_values = explainer(short_data[:2])

In [71]:
shap.plots.text(shap_values[:,:,"POSITIVE"])

In [72]:
shap.plots.text(shap_values[:,:,"NEGATIVE"])