In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from keras.models import Model
from sklearn.metrics import classification_report
from keras import layers, models, optimizers
import sklearn.metrics as metrics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### For 2 labels:

In [2]:
speech_data = pd.read_csv("speech-data-two-label.csv")

# split the dataset into training and testing datasets 
x_train, x_test, y_train, y_test = train_test_split(speech_data['VideoText'], speech_data['Category'], test_size=0.3, random_state=1)


# create a count vectorizer object 
cv = CountVectorizer(stop_words='english')
cv.fit(speech_data['VideoText'])

# transform the training and testing data using count vectorizer object
x_train_count =  cv.transform(x_train)
x_test_count =  cv.transform(x_test)

tfidf_tr = TfidfTransformer()
x_train_count_tfidf = tfidf_tr.fit_transform(x_train_count)
x_test_count_tfidf = tfidf_tr.transform(x_test_count)

# ngram level tf-idf 
tv_ngram = TfidfVectorizer(ngram_range=(1,2))
tv_ngram.fit(speech_data['VideoText'])
x_train_tfidf_ngram =  tv_ngram.transform(x_train)
x_test_tfidf_ngram =  tv_ngram.transform(x_test)

In [3]:
def classifierModel(classifier, Xtrain, Ytrain, Xtest):
    classifier.fit(Xtrain, Ytrain)
    predictions = classifier.predict(Xtest)
    print("\nAccuracy : ", accuracy_score(predictions, y_test))
    
    target_names = ['class 0', 'class 1']
    print(classification_report(y_test, predictions, target_names=target_names))

In [4]:
print("\nNaive Bayes with Count Vectors: ")
classifierModel(MultinomialNB(), x_train_count, y_train, x_test_count)

print("\nNaive Bayes with Count Vectors + TF-IDF: ")
classifierModel(MultinomialNB(), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nNaive Bayes with N-Gram Vectors: ")
classifierModel(MultinomialNB(), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)



Naive Bayes with Count Vectors: 

Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.97      0.84      0.90        45
    class 1       0.59      0.91      0.71        11

avg / total       0.90      0.86      0.87        56


Naive Bayes with Count Vectors + TF-IDF: 

Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.85      1.00      0.92        45
    class 1       1.00      0.27      0.43        11

avg / total       0.88      0.86      0.82        56


Naive Bayes with N-Gram Vectors: 

Accuracy :  0.8392857142857143
             precision    recall  f1-score   support

    class 0       0.83      1.00      0.91        45
    class 1       1.00      0.18      0.31        11

avg / total       0.87      0.84      0.79        56



In [5]:
print("\nLinearSVM with Count Vectors: ")
classifierModel(SVC(kernel='linear'), x_train_count, y_train, x_test_count)

print("\nLinearSVM with Count Vectors + TF-IDF: ")
classifierModel(SVC(kernel='linear'), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nLinearSVM with N-Gram Vectors: ")
classifierModel(SVC(kernel='linear'), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)



LinearSVM with Count Vectors: 

Accuracy :  0.8392857142857143
             precision    recall  f1-score   support

    class 0       0.97      0.82      0.89        45
    class 1       0.56      0.91      0.69        11

avg / total       0.89      0.84      0.85        56


LinearSVM with Count Vectors + TF-IDF: 

Accuracy :  0.9107142857142857
             precision    recall  f1-score   support

    class 0       0.90      1.00      0.95        45
    class 1       1.00      0.55      0.71        11

avg / total       0.92      0.91      0.90        56


LinearSVM with N-Gram Vectors: 

Accuracy :  0.9285714285714286
             precision    recall  f1-score   support

    class 0       0.92      1.00      0.96        45
    class 1       1.00      0.64      0.78        11

avg / total       0.93      0.93      0.92        56



In [6]:
print("\nRandom Forrest Classifier with Count Vectors: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_count, y_train, x_test_count)

print("\nRandom Forrest Classifier with Count Vectors + TF-IDF: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nRandom Forrest Classifier with N-Gram Vectors: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)



Random Forrest Classifier with Count Vectors: 

Accuracy :  0.9642857142857143
             precision    recall  f1-score   support

    class 0       1.00      0.96      0.98        45
    class 1       0.85      1.00      0.92        11

avg / total       0.97      0.96      0.97        56


Random Forrest Classifier with Count Vectors + TF-IDF: 

Accuracy :  0.9464285714285714
             precision    recall  f1-score   support

    class 0       0.94      1.00      0.97        45
    class 1       1.00      0.73      0.84        11

avg / total       0.95      0.95      0.94        56


Random Forrest Classifier with N-Gram Vectors: 

Accuracy :  0.9285714285714286
             precision    recall  f1-score   support

    class 0       0.92      1.00      0.96        45
    class 1       1.00      0.64      0.78        11

avg / total       0.93      0.93      0.92        56



In [24]:
def rnnModel(Xtrain, Ytrain, Xtest):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(1000, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(Xtrain, Ytrain, batch_size=30, epochs=10)
    
    test_loss, test_acc = model.evaluate(Xtest, y_test)
    print("\nAccuracy : ",test_acc)
    
    predictions = model.predict(Xtest).ravel()
    
    target_names = ['class 0', 'class 1']
    print(classification_report(y_test, predictions.round(), target_names=target_names)) 

In [25]:
print("\nRNN with Count Vectors: ")
rnnModel(x_train_count, y_train, x_test_count)

print("\nRNN with  Count Vectors + TF-IDF: ")
rnnModel(x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nRNN with N-Gram Vectors: ")
rnnModel(x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)


RNN with Count Vectors: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00      0.00      0.00        11

avg / total       0.65      0.80      0.72        56


RNN with  Count Vectors + TF-IDF: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714
             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00      0.00      0.00        11

avg / total       0.65      0.80      0.72        56


RNN with N-Gram Vectors: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714
             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00      0.00      0.00        11

avg / total       0.65      0.80      0.72        56



### For 3 Labels:

In [7]:
speech = pd.read_csv("speech-data-three-label.csv")

# split the dataset into training and testing datasets 
x_train, x_test, y_train, y_test = train_test_split(speech['VideoText'], speech['Category'], test_size=0.3, random_state=1)

# create a count vectorizer object 
cv = CountVectorizer(stop_words='english')
cv.fit(speech['VideoText'])

# transform the training and testing data using count vectorizer object
x_train_count =  cv.transform(x_train)
x_test_count =  cv.transform(x_test)

tfidf_tr = TfidfTransformer()
x_train_count_tfidf = tfidf_tr.fit_transform(x_train_count)
x_test_count_tfidf = tfidf_tr.transform(x_test_count)

# ngram level tf-idf 
tv_ngram = TfidfVectorizer(ngram_range=(1,2))
tv_ngram.fit(speech['VideoText'])
x_train_tfidf_ngram =  tv_ngram.transform(x_train)
x_test_tfidf_ngram =  tv_ngram.transform(x_test)


In [8]:
def classifierModel(classifier, Xtrain, Ytrain, Xtest):
    classifier.fit(Xtrain, Ytrain)
    predictions = classifier.predict(Xtest)
    print("\nAccuracy : ", accuracy_score(predictions, y_test))
    
    target_names = ['class 0', 'class 1', 'class 2']
    print(classification_report(y_test, predictions, target_names=target_names))

In [9]:
print("\nNaive Bayes with Count Vectors: ")
classifierModel(MultinomialNB(), x_train_count, y_train, x_test_count)

print("\nNaive Bayes with Count Vectors + TF-IDF: ")
classifierModel(MultinomialNB(), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nNaive Bayes with N-Gram Vectors: ")
classifierModel(MultinomialNB(), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)


Naive Bayes with Count Vectors: 

Accuracy :  0.75
             precision    recall  f1-score   support

    class 0       0.97      0.80      0.88        45
    class 1       0.36      1.00      0.53         5
    class 2       0.20      0.17      0.18         6

avg / total       0.84      0.75      0.77        56


Naive Bayes with Count Vectors + TF-IDF: 

Accuracy :  0.8214285714285714
             precision    recall  f1-score   support

    class 0       0.82      1.00      0.90        45
    class 1       1.00      0.20      0.33         5
    class 2       0.00      0.00      0.00         6

avg / total       0.75      0.82      0.75        56


Naive Bayes with N-Gram Vectors: 

Accuracy :  0.8214285714285714
             precision    recall  f1-score   support

    class 0       0.82      1.00      0.90        45
    class 1       1.00      0.20      0.33         5
    class 2       0.00      0.00      0.00         6

avg / total       0.75      0.82      0.75        56



  'precision', 'predicted', average, warn_for)


In [10]:
print("\nLinearSVM with Count Vectors: ")
classifierModel(LinearSVC(), x_train_count, y_train, x_test_count)

print("\nLinearSVM with Count Vectors + TF-IDF: ")
classifierModel(LinearSVC(), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nLinearSVM with N-Gram Vectors: ")
classifierModel(LinearSVC(), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)



LinearSVM with Count Vectors: 

Accuracy :  0.8214285714285714
             precision    recall  f1-score   support

    class 0       0.98      0.89      0.93        45
    class 1       0.33      0.80      0.47         5
    class 2       0.67      0.33      0.44         6

avg / total       0.89      0.82      0.84        56


LinearSVM with Count Vectors + TF-IDF: 

Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.87      1.00      0.93        45
    class 1       0.75      0.60      0.67         5
    class 2       0.00      0.00      0.00         6

avg / total       0.76      0.86      0.81        56


LinearSVM with N-Gram Vectors: 

Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.85      1.00      0.92        45
    class 1       1.00      0.40      0.57         5
    class 2       1.00      0.17      0.29         6

avg / total       0.88      0.86      0.82      

  'precision', 'predicted', average, warn_for)


In [30]:
print("\nLinearSVM with Count Vectors: ")
classifierModel(SVC(kernel='linear'), x_train_count, y_train, x_test_count)

print("\nLinearSVM with Count Vectors + TF-IDF: ")
classifierModel(SVC(kernel='linear'), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nLinearSVM with N-Gram Vectors: ")
classifierModel(SVC(kernel='linear'), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)


LinearSVM with Count Vectors: 

Accuracy :  0.7678571428571429
             precision    recall  f1-score   support

    class 0       0.97      0.82      0.89        45
    class 1       0.27      0.80      0.40         5
    class 2       0.67      0.33      0.44         6

avg / total       0.88      0.77      0.80        56


LinearSVM with Count Vectors + TF-IDF: 

Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.87      1.00      0.93        45
    class 1       0.75      0.60      0.67         5
    class 2       0.00      0.00      0.00         6

avg / total       0.76      0.86      0.81        56


LinearSVM with N-Gram Vectors: 

Accuracy :  0.8392857142857143
             precision    recall  f1-score   support

    class 0       0.85      1.00      0.92        45
    class 1       0.67      0.40      0.50         5
    class 2       0.00      0.00      0.00         6

avg / total       0.74      0.84      0.78      

  'precision', 'predicted', average, warn_for)


In [11]:
print("\nRandom Forrest Classifier with Count Vectors: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_count, y_train, x_test_count)

print("\nRandom Forrest Classifier with Count Vectors + TF-IDF: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nRandom Forrest Classifier with N-Gram Vectors: ")
classifierModel(RandomForestClassifier(random_state=40,n_estimators=1024), x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)



Random Forrest Classifier with Count Vectors: 

Accuracy :  0.875
             precision    recall  f1-score   support

    class 0       0.96      1.00      0.98        45
    class 1       0.44      0.80      0.57         5
    class 2       0.00      0.00      0.00         6

avg / total       0.81      0.88      0.84        56


Random Forrest Classifier with Count Vectors + TF-IDF: 


  'precision', 'predicted', average, warn_for)



Accuracy :  0.8571428571428571
             precision    recall  f1-score   support

    class 0       0.90      1.00      0.95        45
    class 1       0.50      0.60      0.55         5
    class 2       0.00      0.00      0.00         6

avg / total       0.77      0.86      0.81        56


Random Forrest Classifier with N-Gram Vectors: 


  'precision', 'predicted', average, warn_for)



Accuracy :  0.8392857142857143
             precision    recall  f1-score   support

    class 0       0.85      1.00      0.92        45
    class 1       0.67      0.40      0.50         5
    class 2       0.00      0.00      0.00         6

avg / total       0.74      0.84      0.78        56



  'precision', 'predicted', average, warn_for)


In [32]:
def rnnModel(Xtrain, Ytrain, Xtest):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(1000, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(Xtrain, Ytrain, batch_size=30, epochs=10)
    
    test_loss, test_acc = model.evaluate(Xtest, y_test)
    print("\nAccuracy : ",test_acc)
    
    predictions = model.predict(Xtest)
    
    target_names = ['class 0', 'class 1', 'class 2']
    print(classification_report(y_test, predictions.round(), target_names=target_names)) 
    

In [33]:
print("\nRNN with Count Vectors: ")
rnnModel(x_train_count, y_train, x_test_count)

print("\nRNN with  Count Vectors + TF-IDF: ")
rnnModel(x_train_count_tfidf, y_train, x_test_count_tfidf)

print("\nRNN with N-Gram Vectors: ")
rnnModel(x_train_tfidf_ngram, y_train, x_test_tfidf_ngram)


RNN with Count Vectors: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00      0.00      0.00         5
    class 2       0.00      0.00      0.00         6

avg / total       0.65      0.80      0.72        56


RNN with  Count Vectors + TF-IDF: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714
             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00      0.00      0.00         5
    class 2       0.00      0.00      0.00         6

avg / total       0.65      0.80      0.72        56


RNN with N-Gram Vectors: 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy :  0.8035714
             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89        45
    class 1       0.00    