### 1. Neural Network Classifier with Scikit

In [1]:
# Load required libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential

In [2]:
data = pd.read_json(r'categorized-comments.jsonl', lines=True)

In [3]:
data_sample = data.sample(n=10000)
data_sample

Unnamed: 0,cat,txt
118579,video_games,"I still try and use my vooper in PvP, and you ..."
263253,sports,Bit concerned about hunter because his shootin...
298731,video_games,PS4 and X1 version is planned. Not released ye...
77610,video_games,"just bring metacutioner, electro wiz, and torn..."
461776,video_games,I gotcha. Well that's what's great about the S...
...,...,...
381484,video_games,Car charger and LoZ
4931,science_and_technology,Still doing the size=specs crap unfortunately.
207025,video_games,[deleted]
175714,video_games,That's why I stop buying bp :)


In [4]:
#defining class column
my_encoder = LabelEncoder()
my_class_cat = np.array(my_encoder.fit_transform(data_sample['cat'])).reshape(-1,1)
#Create a "diccionary" to translate the categories into the actual values once you have the output
my_class_decoder = list(np.unique(data_sample['cat']))

In [5]:
data_sample['cat1'] = my_encoder.fit_transform(data_sample['cat'])
data_sample

Unnamed: 0,cat,txt,cat1
118579,video_games,"I still try and use my vooper in PvP, and you ...",2
263253,sports,Bit concerned about hunter because his shootin...,1
298731,video_games,PS4 and X1 version is planned. Not released ye...,2
77610,video_games,"just bring metacutioner, electro wiz, and torn...",2
461776,video_games,I gotcha. Well that's what's great about the S...,2
...,...,...,...
381484,video_games,Car charger and LoZ,2
4931,science_and_technology,Still doing the size=specs crap unfortunately.,0
207025,video_games,[deleted],2
175714,video_games,That's why I stop buying bp :),2


In [6]:
my_class_decoder

['science_and_technology', 'sports', 'video_games']

In [7]:
#Assign the features and target
X_text_train=data_sample['txt'].values
X_text_test=data_sample['txt'].values
y=data_sample['cat1'].values
num_labels = len(np.unique(data_sample['cat1']))

In [8]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stemmer = SnowballStemmer('english')

In [9]:
#Preprocess the text in training and testing
processed_train = []
for doc in X_text_train:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_train.append(stemmed)
    
processed_test = []
for doc in X_text_test:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_test.append(stemmed)

In [10]:
data_sample['processed_txt']=processed_train
data_sample.head()

Unnamed: 0,cat,txt,cat1,processed_txt
118579,video_games,"I still try and use my vooper in PvP, and you ...",2,"[i, still, tri, use, vooper, pvp, still, ``, k..."
263253,sports,Bit concerned about hunter because his shootin...,1,"[bit, concern, hunter, shoot, stat, alreadi, i..."
298731,video_games,PS4 and X1 version is planned. Not released ye...,2,"[ps4, x1, version, plan, not, releas, yet, the..."
77610,video_games,"just bring metacutioner, electro wiz, and torn...",2,"[bring, metacution, electro, wiz, tornado, pos..."
461776,video_games,I gotcha. Well that's what's great about the S...,2,"[i, gotcha, well, 's, 's, great, switch, it, c..."


In [11]:
row_lst = []
for lst in data_sample.loc[:,'processed_txt']:
    text = ''
    for word in lst:
        text = text + ' ' + word
    row_lst.append(text)

data_sample['final_processed_text'] = row_lst

In [12]:
data_sample

Unnamed: 0,cat,txt,cat1,processed_txt,final_processed_text
118579,video_games,"I still try and use my vooper in PvP, and you ...",2,"[i, still, tri, use, vooper, pvp, still, ``, k...",i still tri use vooper pvp still `` kinda '' ...
263253,sports,Bit concerned about hunter because his shootin...,1,"[bit, concern, hunter, shoot, stat, alreadi, i...",bit concern hunter shoot stat alreadi insan h...
298731,video_games,PS4 and X1 version is planned. Not released ye...,2,"[ps4, x1, version, plan, not, releas, yet, the...",ps4 x1 version plan not releas yet the pc ver...
77610,video_games,"just bring metacutioner, electro wiz, and torn...",2,"[bring, metacution, electro, wiz, tornado, pos...",bring metacution electro wiz tornado possibl ...
461776,video_games,I gotcha. Well that's what's great about the S...,2,"[i, gotcha, well, 's, 's, great, switch, it, c...",i gotcha well 's 's great switch it cater use...
...,...,...,...,...,...
381484,video_games,Car charger and LoZ,2,"[car, charger, loz]",car charger loz
4931,science_and_technology,Still doing the size=specs crap unfortunately.,0,"[still, size=spec, crap, unfortun]",still size=spec crap unfortun
207025,video_games,[deleted],2,[delet],delet
175714,video_games,That's why I stop buying bp :),2,"[that, 's, i, stop, buy, bp]",that 's i stop buy bp


In [13]:
X_train, X_test, y_train, y_test = train_test_split(data_sample['final_processed_text'],
                                                   data_sample['cat1'],
                                                   test_size=0.33,
                                                   random_state=8675309)

In [14]:
cv = CountVectorizer(stop_words='english')
cv.fit(X_train)

X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [15]:
# fitting a MLP model to the data
model = MLPClassifier()
model.fit(X_train_cv, y_train)
print(); print(model)

# making predictions
expected_y  = y_test
predicted_y = model.predict(X_test_cv)

# summarizing the fit of the model
print(); print(metrics.classification_report(expected_y, predicted_y))
print(); print(metrics.confusion_matrix(expected_y, predicted_y))


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

              precision    recall  f1-score   support

           0       0.55      0.31      0.39       143
           1       0.53      0.49      0.51       796
           2       0.81      0.85      0.83      2361

    accuracy                           0.74      3300
   macro avg       0.63      0.55      0.58      3300
weighted avg       0.73      0.74      0.73      3300


[[  44   28   71]
 [   9  393  394]
 [  27  325 2009]]


In [16]:
mlr = MLPRegressor(solver='lbfgs', alpha=1e-5, 
                     hidden_layer_sizes=(50,50), max_iter=500, random_state=1)
mlr.fit(X_train_cv, y_train) 

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(50, 50), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=500,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [17]:
mlr.score(X_train_cv, y_train)

0.9199775580848606

In [18]:
mlr = MLPRegressor(solver='lbfgs', alpha=1e-5, 
                     hidden_layer_sizes=(50,50), max_iter=500, random_state=1)
mlr.fit(X_test_cv, y_test) 

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(50, 50), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=500,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [19]:
mlr.score(X_test_cv, y_test)

0.9158756414655458

### 2. Neural Network Classifier with Keras

In [20]:
input_dim = X_train_cv.shape[1]
N_classes = 3

def build_network():
    nn=Sequential()
    nn.add(Dense(500, activation='relu', input_dim=input_dim))
    nn.add(Dense(150, activation='relu'))
    nn.add(Dense(N_classes, activation='softmax'))
    nn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return nn

In [21]:
if __name__ == '__main__':
    from sklearn.pipeline import Pipeline
    pipeline = Pipeline([
        ('nn', KerasClassifier(build_fn=build_network,
                          epochs=1,
                          batch_size=1))
    ])

In [22]:
def train_model(model):
    X = X_train_cv
    y = y_train
    scores = cross_val_score(model, X, y, scoring='accuracy')
    model.fit(X,y)
    return scores

In [23]:
from keras.layers import Activation, Dense
scores = train_model(pipeline)







### 3. Classifying Images

In [24]:
import numpy as np
from tensorflow import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

In [25]:
# Set the color channel value to be first
K.set_image_data_format('channels_last')
#K.set_image_data_format("channels_first")

In [26]:
# Set seed
np.random.seed(0)

In [27]:
# Set image information
channels = 1
height = 28
width = 28

In [28]:
# Load data and target from MNIST data
(data_train, target_train), (data_test, target_test)= mnist.load_data()

In [29]:
# Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], height, width, channels)

In [30]:
# Reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], height, width, channels)

In [31]:
# Rescale pixel intensity to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255

In [32]:
# One-hot encode target 
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

In [33]:
# Start neural network
network = Sequential()

In [34]:
# Add convolutional layer with 64 filters, a 5x5 window and ReLU activation function 
network.add(Conv2D(filters=64,
                  kernel_size=(5, 5),
                  input_shape=(height, width, channels),
                  activation='relu'))

In [35]:
# Add max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

In [36]:
# Add dropout layer
network.add(Dropout(0.5))

In [37]:
# Add layer to flatten input
network.add(Flatten())

In [38]:
# Add fully connected layer of 128 units with a ReLU activation function 
network.add(Dense(128, activation="relu"))

In [39]:
# Add dropout layer
network.add(Dropout(0.5))

In [40]:
# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

In [41]:
# Compile neural network
network.compile(loss="categorical_crossentropy",
               optimizer="rmsprop",
               metrics=["accuracy"]) 

In [42]:
# Train neural network
nnModel = network.fit(features_train,
           target_train,
           epochs=2,
           verbose=0,
           batch_size=1000,
           validation_data=(features_test, target_test))

In [43]:
print(nnModel.history.keys())

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [44]:
train_loss = nnModel.history['loss']
val_loss   = nnModel.history['val_loss']
train_acc  = nnModel.history['accuracy']
val_acc    = nnModel.history['val_accuracy']

In [45]:
# Training dataset accuracy
train_acc

[0.8097500205039978, 0.9396833181381226]

In [46]:
# Validation dataset accuracy
val_acc

[0.9519000053405762, 0.9704999923706055]