In [1]:
import numpy as np
import pandas as pd

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [32]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

In [3]:
# Read the corporate-messaging CSV and preview its first rows.
data = pd.read_csv(
    "data/Corporate-messaging-DFE.csv",
    encoding='iso-8859-1',
)
data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...
2,662822310,False,finalized,3,2/18/15 8:43,Information,1.0,,3.7958e+17,Barclays,Barclays publishes its prospectus for its å£5....
3,662822311,False,finalized,3,2/18/15 9:13,Information,1.0,,3.6753e+17,Barclays,Barclays Group Finance Director Chris Lucas is...
4,662822312,False,finalized,3,2/18/15 6:48,Information,1.0,,3.60385e+17,Barclays,Barclays announces that Irene McDermott Brown ...


In [4]:
# List the column names available in the loaded frame.
data.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'category', 'category:confidence', 'category_gold',
       'id', 'screenname', 'text'],
      dtype='object')

In [5]:
# Class balance of the labels as loaded, before any rows are filtered out.
data['category'].value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [7]:
# Keep only rows whose category confidence is exactly 1 and drop the 'Exclude'
# class (the same filter load_data applies), then re-check the class balance.
# Without this step the counts shown for this cell cannot be reproduced from a
# fresh kernel. The filter is idempotent, so re-running the cell is safe.
data = data[(data['category:confidence'] == 1) & (data['category'] != 'Exclude')]
data['category'].value_counts()

Information    1823
Action          456
Dialogue        124
Name: category, dtype: int64

In [10]:
# Extract the message text (inputs) and the category labels (targets) as arrays.
x, y = (data[column].values for column in ('text', 'category'))

In [15]:
# Regular expression that identifies URLs in the text so they can be replaced with a constant placeholder.
# Written as a raw string so the `\(` and `\)` escapes reach the regex engine without
# triggering Python's invalid-escape-sequence warning; the pattern value is unchanged.
# NOTE(review): inside `[$-_@.&+]` the `$-_` is a character range (from `$` to `_`),
# not three literal characters -- confirm that breadth is intended.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [38]:
def load_data(data_filepath="data/Corporate-messaging-DFE.csv", encoding='iso-8859-1'):
    '''
    Function to load the data and return the message text and its labels.

    Rows are kept only when "category:confidence" equals 1 and the category
    is not 'Exclude'.

    I/p:
        data_filepath: path of the CSV file to read (defaults to the project data file)
        encoding: text encoding passed to pandas.read_csv
    O/p:
        x: array holding the "text" column of the kept rows
        y: array holding the "category" column of the kept rows
    '''
    df = pd.read_csv(data_filepath, encoding=encoding)

    # Boolean mask of the rows to keep; built once so the filter reads as a single rule.
    keep = (df["category:confidence"] == 1) & (df["category"] != 'Exclude')
    df = df.loc[keep]

    return df["text"].values, df["category"].values

In [37]:
def tokenize_text(text):
    '''
    Function to tokenize the text and lemmatize each token using the WordNet Lemmatizer
    I/p:
        text: text string
    O/p:
        list of lemmatized, lower-cased, whitespace-stripped tokens in the text,
        with every URL replaced by the token "urlplaceholder"
    '''
    # Replace every URL match in a single pass. Substituting by match position
    # (rather than findall + str.replace) keeps a URL that is a prefix of a
    # longer URL from rewriting part of the longer one and leaving its tail behind.
    text = re.sub(url_regex, "urlplaceholder", text)

    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # lemmatize first, then normalize case and remove leading/trailing white space
    return [
        lemmatizer.lemmatize(token).lower().strip()
        for token in word_tokenize(text)
    ]

In [39]:
def build_model(random_state=None):
    '''
    Function to build the model: a text-feature pipeline feeding a random forest,
    wrapped in a grid search over the forest's hyperparameters.

    The returned object is not fitted; the search itself runs when the caller
    invokes fit() on it.

    I/p:
        random_state: seed forwarded to RandomForestClassifier; the default None
                      keeps the classifier unseeded, exactly as before
    O/p:
        cv: GridSearchCV object wrapping the pipeline
    '''
    # bag-of-words counts followed by tf-idf weighting, using the custom tokenizer
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize_text)),
        ('tfidf', TfidfTransformer())
    ])

    # we'll use pipeline to chain the transformer estimators and predictor estimators
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_pipeline),
        ])),

        ('clf', RandomForestClassifier(random_state=random_state))
    ])

    # keys use the "<step>__<param>" form to reach the 'clf' step of the pipeline
    parameters = {
        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 3, 4]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [40]:
def display_results(cv, y_test, y_pred):
    '''
    Function to print the labels, confusion matrix, accuracy and best parameters
    I/p:
        cv: fitted search object exposing best_params_
        y_test: true labels
        y_pred: predicted labels, aligned element-by-element with y_test
    O/p:
        None (results are printed)
    '''
    # Coerce to arrays so the accuracy comparison below is element-wise even
    # when plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use every class seen in either array: taking labels from y_pred alone
    # would drop any class the model never predicted from the matrix and
    # silently hide its misclassified samples.
    labels = np.unique(np.concatenate([y_test, y_pred]))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)

In [41]:
def main(random_state=None):
    '''
    Main Function which calls all the sub functions: load the data, split it,
    fit the grid search on the training part and report results on the test part.

    I/p:
        random_state: seed forwarded to train_test_split so the train/test
                      partition can be reproduced; the default None keeps the
                      split unseeded, exactly as before. It seeds only the split.
    O/p:
        None (results are printed by display_results)
    '''
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=random_state
    )

    model = build_model()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    display_results(model, y_test, y_pred)

In [42]:

# Run the end-to-end workflow; the evaluation is printed as this cell's output.
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 86   0  32]
 [  2  24   7]
 [  2   0 448]]
Accuracy: 0.9284525790349417

Best Parameters: {'clf__min_samples_split': 2, 'clf__n_estimators': 50}
