# Visualizing text data using scattertext
In this notebook I use the Formspring data (https://www.kaggle.com/swetaagrawal/formspring-data-for-cyberbullying-detection) to visualize text in a way that is more informative and distinctive than conventional methods. Tutorials and installation instructions for the scattertext library can be found on the author's GitHub page: https://github.com/JasonKessler/scattertext

In [1]:
%matplotlib inline
import pickle
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
#spacy.load('en')
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject a CSS rule that sets the notebook's `.container` element to 98% width.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# The formspring dataset was cleaned and its labels were converted to binary form
# by the extract.py script, then stored in the two pickle files below.
words_file = "my_dataset.pkl"
labels_file = "my_feature_list.pkl"
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# pickle files that you produced yourself.
# The context managers close each file handle as soon as loading finishes.
with open(words_file, "rb") as words_handle:
    word_data = pickle.load(words_handle)
with open(labels_file, "rb") as labels_handle:
    labels_data = pickle.load(labels_handle)
# Translate the binary labels into readable category names:
# "0" -> "notBully", "1" -> "Bully".
new_label = []
for item in labels_data:
    if item == "0":
        new_label.append("notBully")
    elif item == "1":
        new_label.append("Bully")
    else:
        # Raise instead of print + exit(1): exit() ends the whole session, whereas
        # an exception stops this cell and reports which label was unexpected.
        raise ValueError("data not readable: unexpected label {!r}".format(item))

In [3]:
# Pair every comment with its category label in a single DataFrame.
bully_df = pd.DataFrame(
    {
        'text': word_data,
        'flag': new_label,
    }
)

In [4]:
## Scattertext may choke if you have a category with very few words in it,
## so only comments containing more than 10 whitespace-separated words are kept.
words_per_comment = bully_df['text'].str.split().str.len()
bully_df["count"] = words_per_comment
is_long_enough = bully_df['count'] > 10
bully_df_good = bully_df.loc[is_long_enough]
print(bully_df_good.head())
bully_df_good.shape

       flag                                               text  count
0  notBully   what&#039;s your favorite song? :D I like too...     29
1  notBully   <3 </3 ? haha jk! <33\t<3\t </3 ? haha jk! <3...     13
2  notBully   &quot;hey angel  you duh sexy&quot; Really?!?...     17
5  notBully   any makeup tips? i suck at doing my makeup lo...     47
6  notBully   Apriiiiiiiiiiiill!!! I miss uuuu! It&#039;s E...     33


(12256, 3)

In [5]:
# Let scattertext handle the punctuation itself. I noticed that feeding it
# comments whose punctuation has already been treated somehow results in an error.
nlp = st.whitespace_nlp_with_sentences
corpus_factory = st.CorpusFromPandas(
    bully_df_good,
    category_col='flag',
    text_col='text',
    nlp=nlp,
)
corpus = corpus_factory.build()

In [6]:
# List the 10 terms with the highest scaled F-score for the 'Bully' category,
# i.e. the terms most representative of bully-like comments.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Bully Score'] = corpus.get_scaled_f_scores('Bully')
ranked_by_bully = term_freq_df.sort_values(by='Bully Score', ascending=False)
top_bully_terms = ranked_by_bully.head(10).index
pprint(list(top_bully_terms))

['buffalo',
 'buffalo buffalo',
 'bitch',
 'stfu',
 'hoe',
 'talk shit',
 'fuck off',
 'fat',
 'bxtch',
 'isn 39']


In [7]:
# List the 10 terms with the highest scaled F-score for the 'notBully' category,
# i.e. the terms most representative of non-bullying comments.
# (The score column is named 'nonBully Score'; the category label is 'notBully'.)
term_freq_df = corpus.get_term_freq_df()
term_freq_df['nonBully Score'] = corpus.get_scaled_f_scores('notBully')
ranked_by_notbully = term_freq_df.sort_values(by='nonBully Score', ascending=False)
top_notbully_terms = ranked_by_notbully.head(10).index
pprint(list(top_notbully_terms))

['first',
 's your',
 'family',
 'movie',
 'they are',
 'new',
 'money',
 'it is',
 'to have',
 'home']


In [8]:
# Let us now create a scatter plot that shows the Bully and notBully words, with the
# X and Y axes representing the word frequencies. This code produces a standalone
# HTML file, which takes a while to load and works fine with my Chrome browser.
# I have also included a png image in this folder for a quick view of how it looks.
html = st.produce_scattertext_explorer(
    corpus,
    category='Bully',
    category_name='Bully Words',
    not_category_name='notBully',
    width_in_pixels=1000,
)
# Write through a context manager so the file is flushed and closed right away
# instead of leaving an open handle behind.
with open("bully.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))
# Show the number of bytes written as the cell's result.
bytes_written

4771248