# Detecting Insults in Social Networks

![](https://pbs.twimg.com/media/CkEyfjKUUAURpd9.jpg)

In [None]:
import re, string # re: regular expression library

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

##  Load raw data

In [None]:
def clean_text(text):
    """Normalise a comment to space-separated lower-case alphabetic words.

    The text is lower-cased and every run of the letters a-z that is
    delimited by word boundaries is kept; everything else (digits,
    punctuation, tokens that mix letters with digits or underscores)
    is dropped.

    Args:
        text: The raw comment string.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    words = re.findall(r'\b[a-z]+\b', text.lower())
    return ' '.join(words)

In [None]:
# Load the labelled comments from train.csv (path relative to the working directory).
training_data = pd.read_csv('train.csv')
# Preview the first 7 rows.
training_data.head(7)

In [None]:
# Store a normalised copy of every comment (the output of clean_text) in a new column.
training_data['cleaned_comment'] = training_data['Comment'].map(clean_text)

In [None]:
training_data.head(7)

How many comments do we have?

In [None]:
training_data.shape

Which values does the `Insult` column take, and how many comments are insults?

In [None]:
training_data['Insult'].unique()

In [None]:
training_data['Insult'].sum()

Let's look at some of them:

In [None]:
training_data[training_data['Insult'] == 1].head()

In [None]:
training_data['Insult'].value_counts()

In [None]:
training_data['Insult'].mean()

In [None]:
import matplotlib
%matplotlib inline

In [None]:
# For each group: 'sum' adds up the Insult values and 'count' counts the rows.
# Assumes Insult is a 0/1 label, so 'sum' is the number of insults - TODO confirm.
agg_function = {'Insult': ['sum', 'count']}
# Group the comments by the hour component of the parsed Date column and plot both aggregates.
training_data.groupby(pd.to_datetime(training_data.Date).dt.hour).agg(agg_function).plot(title='Total insults per hour VS Total comments')

In [None]:
# Same aggregation as the hourly plot (reuses agg_function), grouped by the month of the parsed Date.
training_data.groupby(pd.to_datetime(training_data.Date).dt.month).agg(agg_function).plot(title='Total insults per month VS Total comments')

## Make some features

We use the "bag of words" model to represent the comments, using scikit-learn's `feature_extraction` module.

In [None]:
# Bag-of-words vectorizer: word n-grams of length 1 to 3, English stop words
# removed, vocabulary capped at 50000 entries.
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english', max_features=50000)
# NOTE(review): the vocabulary is learned from every row, including rows that
# are later held out for validation.
count_vectorizer.fit(training_data['cleaned_comment'])

In [None]:
dir(count_vectorizer)

In [None]:
count_vectorizer.vocabulary_

In [None]:
# X: n-gram count features, one row per cleaned comment.
X = count_vectorizer.transform(training_data['cleaned_comment'])
# y: the label column the models are trained to predict.
y = training_data['Insult']

In [None]:
X

## Cross-validate

### Split data

In [None]:
import numpy as np

In [None]:
# One independent draw per row of X: True with probability 0.75, False otherwise.
mask = [bool(np.random.binomial(1, .75)) for _ in range(X.shape[0])]

In [None]:
mask

In [None]:
# Convert the list to a NumPy boolean array so it can index rows and be negated with ~.
mask = np.array(mask)
# Rows of X where the mask is True.
X[mask]

In [None]:
sum(mask) 

In [None]:
y[mask].shape[0]

In [None]:
y[~mask].shape[0]

In [None]:
def split_data(X, y, p=.75, random_state=None):
    """Randomly split features and labels into training and validation sets.

    Every row is assigned to the training set independently with
    probability ``p``; the remaining rows form the validation set. The
    split sizes are therefore random (about ``p * n`` training rows), and
    either part can be empty for small inputs or extreme ``p``.

    Args:
        X: Row-indexable feature matrix supporting boolean-mask indexing.
        y: Labels aligned with the rows of ``X``; must also support
            boolean-mask indexing.
        p: Probability that a row goes to the training set.
        random_state: Optional integer seed for a reproducible split.
            When ``None`` the global NumPy random state is used, so
            ``np.random.seed`` still controls the outcome.

    Returns:
        A tuple ``(X_train, y_train, X_validation, y_validation)``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw all Bernoulli(p) samples in one vectorised call instead of one
    # Python-level call per row.
    mask = rng.binomial(1, p, size=X.shape[0]).astype(bool)

    X_train = X[mask]
    y_train = y[mask]
    X_validation = X[~mask]
    y_validation = y[~mask]

    return X_train, y_train, X_validation, y_validation

In [None]:
X_train, y_train, X_validation, y_validation = split_data(X, y)

### Fit a model on training data

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default hyper-parameters,
# fitted on the training split only.
model = LogisticRegression()
model.fit(X_train, y_train)

Our model has the form $wx + b$. Its coefficients $w$ are:

In [None]:
model.coef_

and the intercept $b$:

In [None]:
model.intercept_

### Validate model on validation data

In [None]:
from sklearn.metrics import accuracy_score


# Predict labels for the held-out rows and compare them with the true labels.
predictions = model.predict(X_validation)
validation_score = accuracy_score(y_validation, predictions)

print('Validation Score:', validation_score)

In [None]:
predictions.mean()

In [None]:
# Baseline: predict 0 for every validation row.
baseline_predictions = np.zeros(predictions.shape[0])

In [None]:
# Accuracy of the constant-0 baseline on the same validation labels.
baseline_validation_score = accuracy_score(y_validation, baseline_predictions)

# NOTE(review): printed with the same 'Validation Score:' label as the model's score.
print('Validation Score:', baseline_validation_score)

So our logistic regression model does better than predicting the same value (0) for every case.

## Remember, everything is a hyper-parameter..

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
class PredictionPipeline:
    """Train and score one (n-gram range, vectorizer, model) configuration.

    The pipeline fits the vectorizer on the ``'cleaned_comment'`` column of
    ``training_data``, turns those comments into features, splits features
    and ``'Insult'`` labels into random training and validation parts with
    ``split_data``, fits the model on the training part and stores its
    accuracy on the validation part in ``validation_score``.

    Args:
        ngram_range: ``(min_n, max_n)`` tuple passed to the vectorizer.
        vectorizer_class: Class instantiated with the keyword arguments
            ``analyzer``, ``ngram_range``, ``stop_words`` and
            ``max_features``; the instance must provide ``fit`` and
            ``transform``.
        model_class: Class instantiated without arguments; the instance
            must provide ``fit`` and ``predict``.
        training_data: Table with ``'cleaned_comment'`` and ``'Insult'``
            columns.
    """

    def __init__(self, ngram_range, vectorizer_class, model_class, training_data):
        self.ngram_range = ngram_range
        self.vectorizer_class = vectorizer_class
        self.model_class = model_class
        self.training_data = training_data

        # Everything below is populated by run().
        self.vectorizer = None
        self.X = None
        self.y = None
        self.X_train = None
        self.y_train = None
        self.X_validation = None
        self.y_validation = None
        self.model = None
        self.validation_score = None

    def run(self):
        """Execute every pipeline stage in order and print a summary."""
        self._fit_vectorizer()
        self._featurize_text()
        self._split_train_and_validation_sets()
        self._fit_model_on_training_data()
        self._validate_model_on_validation_set()

        print(
            """
            Vectorizer Class: {vectorizer_class}\n\
            N-gram Range: {ngram_range}\n\
            Model Class: {model_class}\n\
            Validation Score: {validation_score}
            """.format(
                vectorizer_class=repr(self.vectorizer_class.__name__),
                ngram_range=self.ngram_range,
                model_class=repr(self.model_class.__name__),
                validation_score=round(self.validation_score, 4)
            )
        )

    def _fit_vectorizer(self):
        """Instantiate the configured vectorizer and learn its vocabulary."""
        # Use the configuration stored on the instance, not module-level
        # names that merely happen to be spelled the same.
        self.vectorizer = self.vectorizer_class(
            analyzer='word',
            ngram_range=self.ngram_range,
            stop_words='english',
            max_features=50000,
        )
        self.vectorizer.fit(self.training_data['cleaned_comment'])

    def _featurize_text(self):
        """Transform the cleaned comments into features and pick up the labels."""
        self.X = self.vectorizer.transform(self.training_data['cleaned_comment'])
        self.y = self.training_data['Insult']

    def _split_train_and_validation_sets(self):
        """Randomly split features and labels using split_data's default ratio."""
        self.X_train, self.y_train, self.X_validation, self.y_validation = split_data(
            self.X, self.y)

    def _fit_model_on_training_data(self):
        """Instantiate the configured model and fit it on the training split."""
        self.model = self.model_class()
        self.model.fit(self.X_train, self.y_train)

    def _validate_model_on_validation_set(self):
        """Store the model's accuracy on the validation split."""
        predictions = self.model.predict(self.X_validation)
        self.validation_score = accuracy_score(self.y_validation, predictions)

In [None]:
# Maps str(validation score) -> the hyper-parameters that produced it.
# NOTE(review): two configurations with exactly the same score share a key,
# so the later one overwrites the earlier one.
results = {}

# Exhaustive grid: 4 n-gram ranges x 2 vectorizers x 3 models = 24 runs.
for ngram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
    for vectorizer_class in [CountVectorizer, TfidfVectorizer]:
        for model_class in [LogisticRegression, LinearSVC, RandomForestClassifier]:
            
            # run prediction pipeline
            prediction_pipeline = PredictionPipeline(
                ngram_range=ngram_range,
                vectorizer_class=vectorizer_class,
                model_class=model_class,
                training_data=training_data
            )
            
            prediction_pipeline.run()
            
            # add hyper-parameters to `results` dictionary
            results[str(prediction_pipeline.validation_score)] = {
                    'vectorizer_class': prediction_pipeline.vectorizer_class,
                    'ngram_range': prediction_pipeline.ngram_range,
                    'model_class': prediction_pipeline.model_class
            }

In [None]:
# The keys of `results` are scores stored as strings. Rank them by numeric
# value: a plain string sort compares character by character, so for
# example '9e-05' would rank above '0.9'.
top_3_scores = sorted(results, key=float, reverse=True)[:3]

for score in top_3_scores:
    print('Score: {score}\nParameters: {parameters}\n'.format(
        score=score, parameters=results[score]))

## Train final model

In [None]:
# Key (score as a string) of the first entry in the top-3 ranking.
top_score_key = top_3_scores[0]

In [None]:
# Hyper-parameters recorded under the selected score.
best_parameters = results[top_score_key]
vectorizer_class = best_parameters['vectorizer_class']
ngram_range = best_parameters['ngram_range']
model_class = best_parameters['model_class']

# This time every labelled comment is used: nothing is held out.
comments = training_data['cleaned_comment']

# Learn the vocabulary with the selected vectorizer configuration.
vectorizer = vectorizer_class(
    analyzer='word',
    ngram_range=ngram_range,
    stop_words='english',
    max_features=50000,
)
vectorizer.fit(comments)

# Features and labels for the full data set.
X = vectorizer.transform(comments)
y = training_data['Insult']

# Final model, fitted on all rows.
model = model_class()
model.fit(X, y)

## Run it live

In [None]:
# Interactive demo: classify each line typed by the user with the final model.
# The loop runs until input ends (EOF) or the user/kernel interrupts it.
while True:
    try:
        input_string = input('Please enter a string: ')
    except (EOFError, KeyboardInterrupt):
        # Leave the loop cleanly instead of ending the demo with a traceback.
        break

    # Apply the same cleaning and vectorization that was used for training.
    input_string = clean_text(input_string)
    x_test = vectorizer.transform([input_string])

    prediction = model.predict(x_test)[0]
    print('Insult?: {}'.format( bool(prediction)))