### BELLEVUE UNIVERSITY
### DSC-550 Data Mining
### Name: Tai Ngo
### Date: 5/18/2020

### 9.3 Exercise: Neural Network Classifiers

### 1. Neural Network Classifier with Scikit-Learn

In [1]:
# load libraries
import pandas as pd
import numpy as np
import json
import re
import pickle
import nltk
import sklearn

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, auc, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

In [2]:
# Read data
def read_data(file):
    '''
    Take a json file location and
    read the file into a pandas data frame
    Args: full path to file
    Returns: pandas dataframe with data from file
    '''
    
    data = []
    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
    # convert to dataframe
    return pd.DataFrame(data)

In [3]:
# load data into a dataframe
con_df = pd.read_json("controversial-comments.jsonl", lines=True)

# check size, structure and categories

print('Size: ', len(con_df), '\n',
      'Shape: ', con_df.info(), '\n',
      'Categories: ', con_df.con.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950000 entries, 0 to 949999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   con     950000 non-null  int64 
 1   txt     950000 non-null  object
dtypes: int64(1), object(1)
memory usage: 14.5+ MB
Size:  950000 
 Shape:  None 
 Categories:  [0 1]


In [4]:
# pre-process data
def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text 
    Output: text
    """
    
    text=text.lower()
    text=re.sub('</?.*?>',' <>', text)
    text=re.sub('\\d|\\W+|_',' ',text)
    text=re.sub('[^a-zA-Z]'," ", text)
    
    return text

In [5]:
# create stop word list
stop_words = stopwords.words('english')

size = 10000    # sample size
replace = True  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

controversy = con_df.groupby('con', as_index=False).apply(fn)

In [6]:
# free up memory

del con_df

controversy['txt'] = controversy['txt'].apply(lambda x:clean_text(x))
controversy.reset_index(drop=True, inplace=True)

controversy.head()

Unnamed: 0,con,txt
0,0,i m going to upvote any comment featuring trai...
1,0,s
2,0,generation x now fucked by millennials too
3,0,yeah but we all know births are rigged
4,0,there is no d chess just a monkey shitting on...


In [7]:
# documents() to retrieve the pickled, part-of-speech tagged documents from the corpus reader object
def documents(corpus):
    return list(corpus.reviews())

# continuous() to get the original numeric ratings
def continuous(corpus):
    return list(corpus.scores())

# make_categorical() to use Numpy's digitize method to bin the ratings
def make_categorical(corpus):
    return np.digitize(continuous(corpus), [0, 1])

In [8]:
# use the PickledReviewsReader() from the database
import pickle
from nltk.corpus.reader.api import CorpusReader

PKL_PATTERN = r'(?!\.)[\w\s\d\-]+\.pickle'

class PickledReviewsReader(CorpusReader):
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader
        """
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def texts_scores(self, fileids=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the SqliteCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def reviews(self, fileids=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for text,score in self.texts_scores(fileids):
            yield text

    def scores(self, fileids=None):
        """
        Return the scores
        """
        for text,score in self.texts_scores(fileids):
            yield score

    def paras(self, fileids=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for review in self.reviews(fileids):
            for paragraph in review:
                yield paragraph

    def sents(self, fileids=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids):
            for sentence in paragraph:
                yield sentence

    def tagged(self, fileids=None):
        for sent in self.sents(fileids):
            for token in sent:
                yield token

    def words(self, fileids=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for token in self.tagged(fileids):
            yield token[0]

In [9]:
# use the TextNormalizer() from the database
from sklearn.base import BaseEstimator, TransformerMixin

class TextNormalizer(BaseEstimator, TransformerMixin):

    def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        return token.lower() in self.stopwords

    def normalize(self, document):
        return [
            self.lemmatize(token, tag).lower()
            for sentence in document
            for (token, tag) in sentence
            if not self.is_punct(token)
               and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        return [
            ' '.join(self.normalize(doc)) for doc in documents
        ]

In [10]:
# assign a variable to load into the ML model
corpus = controversy
X = corpus['txt']
y = corpus['con']

In [11]:
# load library
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# convert text to numbers
vectorizer = CountVectorizer(max_features=5000, min_df=0.5, max_df=10, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
X

array([[1, 0],
       [0, 1]], dtype=int64)

In [13]:
# convert values into tdidf values
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
X

array([[1., 0.],
       [0., 1.]])

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(corpus['txt']).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# divide the data into training (80%) and testing sets (20%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [16]:
# use Random Forest Algorithm
from sklearn.ensemble import RandomForestClassifier

In [17]:
# train the algorithm
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [18]:
# make prediction
y_pred = classifier.predict(X_test)

In [19]:
# evaluate the performance of the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1178  816]
 [ 750 1256]]
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      1994
           1       0.61      0.63      0.62      2006

    accuracy                           0.61      4000
   macro avg       0.61      0.61      0.61      4000
weighted avg       0.61      0.61      0.61      4000

0.6085


#### The accuracy score is 0.6085. I tried to use higher sample size, but I got lower accuracy score. My guess is that as the data starts with con=0, it goes up to about half the data size, then con=1; it is possible that as the sample size gets bigger, more data gets into con=1 territory, the accuracy drops.