In [None]:
import sklearn
import nltk 

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Download the NLTK stop-word corpus; the stop-word filtering further down reads it
nltk.download('stopwords')

In [None]:
# Load the raw tweets; later cells use the tweet_text and cyberbullying_type columns
df = pd.read_csv('../data/cyberbullying_tweets.csv')

## Data Exploration and initial cleaning

In [None]:
# Summary statistics for the columns of the freshly loaded frame
df.describe()

In [None]:
# Take a look at a random sample of the dataset.
# Draw the 10 rows directly rather than shuffling (and copying) the whole frame
# only to keep its head; min() keeps this working when fewer than 10 rows exist.
df.sample(n=min(10, len(df)))

In [None]:
# Keep only rows where both the tweet text and its label are present
has_text = df['tweet_text'].notna()
has_label = df['cyberbullying_type'].notna()
df = df[has_text & has_label]

In [None]:
# Number of rows per cyberbullying_type value
df['cyberbullying_type'].value_counts()

In [None]:
# Row count before cleaning; used later as the baseline for reporting how much was deleted
initial_length = df.shape[0]

In [None]:
def plot_histogram(df, column, n_bins):
    """Plot and return a histogram of the string lengths found in ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of a string column; the length of each value is what gets binned.
    n_bins : int
        Divisor applied to the number of values to obtain the bin count, i.e.
        roughly "values per bin" -- NOT the number of bins, despite the name.
        The resulting bin count is ``int(number_of_values / n_bins)``, clamped
        to at least 1. Must be non-zero.

    Returns
    -------
    (hist, bin_edges)
        The two arrays returned by ``numpy.histogram``.
    """
    # Compute the lengths once; missing values have no length and would make
    # numpy.histogram fail on a non-finite range, so they are left out.
    lengths = df[column].str.len().dropna().to_numpy()

    # Fewer values than n_bins used to truncate to 0 bins, which
    # numpy.histogram rejects; always use at least one bin.
    bin_count = max(1, int(len(lengths) / n_bins))
    hist, bin_edges = np.histogram(lengths, bins=bin_count)

    fig, ax = plt.subplots()
    # bin_edges has one more entry than hist: left edges position the bars,
    # consecutive differences give their widths.
    ax.bar(bin_edges[:-1], hist, width=np.diff(bin_edges), edgecolor="black", align="edge")
    ax.set_title("Histogram showing binned length of tweets")
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Length of tweet')
    plt.show()
    return hist, bin_edges

In [None]:
# The last argument is the divisor plot_histogram applies to the row count to get the number of bins
hist_values, bins = plot_histogram(df, 'tweet_text', 2000)

### The histogram shows an uneven distribution of data, showing outliers with large string length
Let's clean up the outliers

In [None]:
# Show the raw bin counts and bin edges returned by plot_histogram
hist_values, bins

In [None]:
# Keep only tweets shorter than 300 characters; this drops the long outliers
# visible in the length histogram
df = df[df.tweet_text.str.len()< 300]

# Disabled: also remove tweets with less than 30 chars, as the text might be too short
# to make a prediction on.
# NOTE(review): the mask is parenthesised below because `~` binds tighter than `<`;
# without the parentheses the lengths are negated, not the comparison result.
# df = df[~(df.tweet_text.str.len() < 30)]

In [None]:
# Re-plot the tweet-length histogram on the length-filtered frame
hist_values, bins = plot_histogram(df, 'tweet_text', 2000)

In [None]:
# Remove @tags as they are not important for prediction.
# assign() returns a new frame, so we never write into a filtered slice of an
# earlier frame (which is what triggers pandas' SettingWithCopyWarning).
df = df.assign(
    tweet_text=df.tweet_text.apply(
        lambda x: ' '.join(word for word in x.split(' ') if not word.startswith('@'))
    )
)

# Remove label which is not focused on any one type of context
df = df[df.cyberbullying_type != 'other_cyberbullying']

# Remove duplicated rows (the first occurrence of each text is kept)
df = df[~df.tweet_text.duplicated()]

# Operations like removing @tags can leave us with empty strings. The
# split(' ')/join above preserves runs of spaces, so a tweet made only of @tags
# can also collapse to spaces alone; compare the stripped text to catch both.
df = df[df.tweet_text.str.strip() != '']

In [None]:
# Share of rows removed since initial_length was recorded, as a percentage
print(f"We have deleted {(initial_length - len(df))/initial_length*100}% of the initial dataset")

In [None]:
# Take a look at a random sample of the dataset.
# Draw the 10 rows directly rather than shuffling (and copying) the whole frame
# only to keep its head; min() keeps this working when fewer than 10 rows exist.
df.sample(n=min(10, len(df)))

In [None]:
# Inspect one tweet and its label, picked at a uniformly random row position
i = np.random.randint(len(df))
print(i)
picked_row = df.iloc[i]
picked_row.tweet_text, picked_row.cyberbullying_type

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file
df.to_csv('../data/cyberbullying_tweets_clean.csv', index=False)

### Examples of issues with the dataset (iloc, reason)
21058: Not religion

9002: News tweet

### Possible Feature Engineering
- Check for capital letters (angry messages)
- Isolate cuss words

In [None]:
from nltk.corpus import stopwords

custom_stopwords = ['u', 'ur', 'i\'m']
for custom_word in custom_stopwords:
    stopwords.words('english').append(custom_word)

In [None]:
df.tweet_text = df.tweet_text.apply(lambda x: ' '.join([word for word in x.lower().split(' ')\
                                                        if word not in stopwords.words('english')]))

In [None]:
from collections import Counter
from wordcloud import WordCloud


for label in df.cyberbullying_type.unique():
    print("Word cloud for", label)
    # All tweets carrying this label, flattened into one space-separated string
    label_text = ' '.join(df.loc[df.cyberbullying_type == label, 'tweet_text'])

    # Ten most frequent whitespace-separated tokens, printed as (token, count) pairs
    token_counts = Counter(label_text.split())
    for item in token_counts.most_common(10):
        print(item)

    cloud_builder = WordCloud(collocations=False, background_color='white')
    word_cloud = cloud_builder.generate(label_text)
    # Render the generated word cloud without axes
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    print("\n\n")
    