In [1]:
import sys  
!{sys.executable} -m pip install contractions



In [2]:
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
%matplotlib inline

init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()

# Importing the dataset
dataset = pd.read_csv('train.csv', delimiter = ',',usecols=["text", "offense_rating"])

print(dataset)

                                                   text  offense_rating
0     TENNESSEE: We're the best state. Nobody even c...            0.20
1     A man inserted an advertisement in the classif...            1.10
2     How many men does it take to open a can of bee...            2.40
3     Told my mom I hit 1200 Twitter followers. She ...            0.00
4     Roses are dead. Love is fake. Weddings are bas...            0.10
...                                                 ...             ...
7995  Lack of awareness of the pervasiveness of raci...            0.25
7996    Why are aspirins white? Because they work sorry            3.85
7997  Today, we Americans celebrate our independence...            0.00
7998  How to keep the flies off the bride at an Ital...            3.00
7999  "Each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]


In [3]:
print(len(dataset[dataset['offense_rating'] == '']))

0


There are no cells without an offense_rating value

In [4]:
dataset['offense_rating'].iplot(
    kind='hist',
    bins=50,
    xTitle='Offense Rating',
    linecolor='black',
    yTitle='count',
    title='Offense Rating Distribution')

### The offense rating mean:

In [5]:
mean = dataset['offense_rating'].mean()
print("Mean value for offense rating:", mean)

Mean value for offense rating: 0.5853250000000031


### The offense rating median:

In [6]:
median = dataset['offense_rating'].median()
print("Median value for offense rating:", median)

Median value for offense rating: 0.1


### The offense rating mode:

In [7]:
modeOffense = dataset['offense_rating'].mode()
print("Mode value for offense rating:", modeOffense)

Mode value for offense rating: 0    0.0
dtype: float64


### The offense rating variance:

In [8]:
variance = dataset['offense_rating'].var()
print("Variance value for offense rating:", variance)

Variance value for offense rating: 0.960312183397919


### The offense rating standard deviation:

In [9]:
std = dataset['offense_rating'].std()
print("Standard Deviation value for offense rating:", std)

Standard Deviation value for offense rating: 0.9799551945869357


### The offense rating skewness:

In [10]:
skewness = dataset['offense_rating'].skew()
print("Skewness value for offense rating:", skewness)

Skewness value for offense rating: 2.025170682712623


### Processing the data

#### To help normalize the text, it should all be converterd to lower case.

In [11]:
processed_data = dataset.copy()
processed_data["text"] = processed_data['text'].str.lower()
print(dataset)
print(processed_data)

                                                   text  offense_rating
0     TENNESSEE: We're the best state. Nobody even c...            0.20
1     A man inserted an advertisement in the classif...            1.10
2     How many men does it take to open a can of bee...            2.40
3     Told my mom I hit 1200 Twitter followers. She ...            0.00
4     Roses are dead. Love is fake. Weddings are bas...            0.10
...                                                 ...             ...
7995  Lack of awareness of the pervasiveness of raci...            0.25
7996    Why are aspirins white? Because they work sorry            3.85
7997  Today, we Americans celebrate our independence...            0.00
7998  How to keep the flies off the bride at an Ital...            3.00
7999  "Each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]
                                                   text  offense_rating
0     tennessee: we're the best state. 

#### Then, the contractions should be removed, since it doesn't add meaning to sentences:

In [12]:
import contractions

def contract(text, remove_stopwords = True):
    expanded_words = []    

    for word in text.split():
      # using contractions.fix to expand the shotened words
      expanded_words.append(contractions.fix(word))   
    
    expanded_text = ' '.join(expanded_words)
    return expanded_text

processed_data["text"] = list(map(contract, processed_data.text))
print(processed_data)


                                                   text  offense_rating
0     tennessee: we are the best state. nobody even ...            0.20
1     a man inserted an advertisement in the classif...            1.10
2     how many men does it take to open a can of bee...            2.40
3     told my mom i hit 1200 twitter followers. she ...            0.00
4     roses are dead. love is fake. weddings are bas...            0.10
...                                                 ...             ...
7995  lack of awareness of the pervasiveness of raci...            0.25
7996    why are aspirins white? because they work sorry            3.85
7997  today, we americans celebrate our independence...            0.00
7998  how to keep the flies off the bride at an ital...            3.00
7999  "each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]


#### Removing punctuation:

In [13]:
import re

def remove_punctuation(text):  
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    return text

processed_data["text"] = list(map(remove_punctuation, processed_data.text))

print(processed_data)

                                                   text  offense_rating
0     tennessee  we are the best state  nobody even ...            0.20
1     a man inserted an advertisement in the classif...            1.10
2     how many men does it take to open a can of bee...            2.40
3     told my mom i hit 1200 twitter followers  she ...            0.00
4     roses are dead  love is fake  weddings are bas...            0.10
...                                                 ...             ...
7995  lack of awareness of the pervasiveness of raci...            0.25
7996    why are aspirins white  because they work sorry            3.85
7997  today  we americans celebrate our independence...            0.00
7998  how to keep the flies off the bride at an ital...            3.00
7999   each ounce of sunflower seeds gives you 37  o...            0.00

[8000 rows x 2 columns]


#### Removing stopwords

In [14]:
from nltk.corpus import stopwords

def remove_stopwords(text):  
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

processed_data["text"] = list(map(remove_stopwords, processed_data.text))
print(processed_data)

                                                   text  offense_rating
0     tennessee best state nobody even comes close e...            0.20
1     man inserted advertisement classifieds wife wa...            1.10
2     many men take open beer none open time brings ...            2.40
3     told mom hit 1200 twitter followers pointed br...            0.00
4     roses dead love fake weddings basically funera...            0.10
...                                                 ...             ...
7995  lack awareness pervasiveness racism society pr...            0.25
7996                          aspirins white work sorry            3.85
7997  today americans celebrate independence britain...            0.00
7998  keep flies bride italian wedding keep bucket s...            3.00
7999  ounce sunflower seeds gives 37 daily need vita...            0.00

[8000 rows x 2 columns]


#### Tokenizing text

In [15]:
import nltk

def tokenize(text):
    text =  nltk.WordPunctTokenizer().tokenize(text)
    return text

processed_data["text"] = list(map(tokenize, processed_data.text))

print(processed_data)

                                                   text  offense_rating
0     [tennessee, best, state, nobody, even, comes, ...            0.20
1     [man, inserted, advertisement, classifieds, wi...            1.10
2     [many, men, take, open, beer, none, open, time...            2.40
3     [told, mom, hit, 1200, twitter, followers, poi...            0.00
4     [roses, dead, love, fake, weddings, basically,...            0.10
...                                                 ...             ...
7995  [lack, awareness, pervasiveness, racism, socie...            0.25
7996                     [aspirins, white, work, sorry]            3.85
7997  [today, americans, celebrate, independence, br...            0.00
7998  [keep, flies, bride, italian, wedding, keep, b...            3.00
7999  [ounce, sunflower, seeds, gives, 37, daily, ne...            0.00

[8000 rows x 2 columns]


### Splitting data into train and test data

In [16]:
from sklearn.model_selection import train_test_split
percentage = 0.8
data_train, data_test = train_test_split(processed_data, train_size = percentage)

data_train_lem = data_train.copy()
data_test_lem = data_test.copy()

data_train_stem = data_train.copy()
data_test_stem = data_test.copy()

#### Lemmatizing:

In [17]:
# Lemmatizing

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatized_words(dataset):
    lemm = nltk.stem.WordNetLemmatizer()
    dataset['text'] = list(map(lambda word:
                                     list(map(lemm.lemmatize, word)),
                                     dataset.text))
    
lemmatized_words(data_train_lem)
lemmatized_words(data_test_lem)

def join_tokens(text):
    text = " ".join(text)
    text.strip()
    return text

data_train_lem["text"] = list(map(join_tokens, data_train_lem.text))
data_train_lem["text"] = data_train_lem["text"].str.strip()

data_test_lem["text"] = list(map(join_tokens, data_test_lem.text))
data_test_lem["text"] = data_test_lem["text"].str.strip()

print(data_train_lem)
print(data_test_lem)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                   text  offense_rating
6433  going die get much say get decide going live d...            0.00
5353                  sometimes something worse nothing            0.00
3102  broccoli high potassium help maintain healthy ...            0.00
4712  act kindness random act kindness matter small ...            0.00
1891  number black priest small run away called fath...            3.90
...                                                 ...             ...
4902         soda contains 10 teaspoon sugar soda sugar            0.00
2684  way see sometimes get want life sometimes some...            0.00
7602  shrink console look friend find none people ca...            0.00
4764  anyone remember joke dwarf think right easy re...            1.55
1719  third time I found jeff goldblum box kellogg r...            0.00

[6400 rows x 2 columns]
                                                   text  offense_rating
7049  wow tough guy argued internet dar

#### Stemming:

In [18]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    text = ' '.join([ps.stem(w) for w in text])
    return text

data_train_stem["text"] = list(map(stem, data_train_stem.text))
data_train_stem["text"] = data_train_stem["text"].str.strip()

data_test_stem["text"] = list(map(stem, data_test_stem.text))
data_test_stem["text"] = data_test_stem["text"].str.strip()

print(data_train_stem)
print(data_test_stem)

                                                   text  offense_rating
6433        go die get much say get decid go live decid            0.00
5353                           sometim someth wors noth            0.00
3102  broccoli high potassium help maintain healthi ...            0.00
4712  act kind random act kind matter small make tre...            0.00
1891  number black priest small run away call father...            3.90
...                                                 ...             ...
4902          soda contain 10 teaspoon sugar soda sugar            0.00
2684  way see sometim get want life sometim sometim ...            0.00
7602  shrink consol look friend find none peopl capa...            0.00
4764  anyon rememb joke dwarf think right easi remem...            1.55
1719  third time I found jeff goldblum box kellogg r...            0.00

[6400 rows x 2 columns]
                                                   text  offense_rating
7049  wow tough guy argu internet dare 

### Using an SVR algorithm

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
import time

# This is already using the "optimal" parameters
svr_lem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')),
                          ('tfidf', TfidfTransformer(use_idf = False)), ('svr', SVR(kernel = "rbf", 
                            C = 1000.0, shrinking = True, gamma = "scale"))])

start_time = time.time()

svr_lem_model = svr_lem_model.fit(data_train_lem.text, data_train_lem.offense_rating)

print("Training time for the lemmatied text: %s seconds." % (time.time() - start_time))

# =================================================================================================

# This is already using the "optimal" parameters
svr_stem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')), 
                           ('tfidf', TfidfTransformer(use_idf = False)), ('svr', SVR(kernel = "rbf",
                            C = 1000.0, shrinking = True, gamma = "scale"))])

start_time = time.time()

svr_stem_model = svr_stem_model.fit(data_train_stem.text, data_train_stem.offense_rating)

print("Training time for the stemmed text: %s seconds." % (time.time() - start_time))


Training time for the lemmatied text: 3.4999959468841553 seconds.
Training time for the stemmed text: 3.4509966373443604 seconds.


### Grid search to determine best parameters
#### Parameters for the lemmatization model
This takes a while to run so here are the best parameters for the model used with lemmatized text:
- svr__gamma: 'scale'
- svr__kernel: 'rbf'
- svr__shrinking: True
- tfidf__use_idf: False
- vect__ngram_range: (1, 1)

In [None]:
from sklearn.model_selection import GridSearchCV
parameters_svr = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'svr__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
                   'svr__shrinking': (True, False),
                   'svr__gamma': ("scale", "auto"),
}
gs_svr = GridSearchCV(svr_lem_model, parameters_svr, n_jobs=-1)
gs_svr = gs_svr.fit(data_train_lem.text, data_train_lem.offense_rating)
gs_svr.best_score_
gs_svr.best_params_

#### Parameters for the stemming model
This takes a while to run so here are the best parameters for the model used with stemmed text:
- svr__gamma: 'scale'
- svr__kernel: 'rbf'
- svr__shrinking: True
- tfidf__use_idf: False
- vect__ngram_range: (1, 1)

In [None]:
parameters_svr = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'svr__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
                   'svr__shrinking': (True, False),
                   'svr__gamma': ("scale", "auto"),
}
gs_svr = GridSearchCV(svr_stem_model, parameters_svr, n_jobs=-1)
gs_svr = gs_svr.fit(data_train_stem.text, data_train_stem.offense_rating)
gs_svr.best_score_
gs_svr.best_params_

### Metrics for the SVR

#### MSE and RMSE for the SVR model trained with lemmatized text

In [20]:
from sklearn.metrics import mean_squared_error
predicted = svr_lem_model.predict(data_test_lem.text)
true = data_test_lem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))

Mean squared error: 0.4772971271678978.
Root mean squared error: 0.6908669388296836.


#### MSE and RMSE for the SVR model trained with stemmed text

In [21]:
from sklearn.metrics import mean_squared_error
predicted = svr_stem_model.predict(data_test_stem.text)
true = data_test_stem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))

Mean squared error: 0.4778143204334434.
Root mean squared error: 0.6912411449222647.
