In [1]:
import numpy as np 
import pandas as pd 
from nltk.tokenize.regexp import WordPunctTokenizer
import spacy
import re
from collections import defaultdict
from pandasql import sqldf

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [2]:
data = pd.read_csv("/Users/timxymo/Dropbox/UA/2022 Spring/LING 539/Final Project/raw_text.csv")
data.head()

Unnamed: 0,text,label
0,The following content is\nprovided under a Cre...,Calculus
1,"In this sequence of segments,\nwe review some ...",Probability
2,The following content is\nprovided under a Cre...,CS
3,The following\ncontent is provided under a Cre...,Algorithms
4,The following\ncontent is provided under a Cre...,Algorithms


In [90]:
data[data['label']=='Statistics']

Unnamed: 0,text,label
23,The following content is\nprovided under a Cre...,Statistics
46,The following\ncontent is provided under a Cre...,Statistics
58,Let's explore what happens to\ndeterminants wh...,Statistics
73,The following content is\nprovided under a Cre...,Statistics
77,"You now know what a\ntransformation is, so l...",Statistics
...,...,...
828,The following content is\nprovided under a Cre...,Statistics
836,The following content is\nprovided under a Cre...,Statistics
838,The following content is\nprovided under a Cre...,Statistics
839,INTRODUCTION: The\nfollowing content is provid...,Statistics


In [91]:
data.iloc[23,0]

"The following content is\nprovided under a Creative Commons license. Your support will help\nMIT OpenCourseWare continue to offer high quality\neducational resources for free. To make a donation or to\nview additional materials from hundreds of MIT courses,\nvisit MIT OpenCourseWare at ocw.mit.edu. PROFESSOR: So I'm using\na few things here, right? I'm using the fact that\nKL is non-negative. But KL is equal to 0 when I\ntake twice the same argument. So I know that this function\nis always non-negative. So that's theta and that's\nKL P theta star P theta. And I know that at theta\nstar, it's equal to 0. OK? I could be in the case\nwhere I have this happening. I have two-- let's call\nit theta star prime. I have two minimizers. That could be the case, right? I'm not saying\nthat-- so K of L-- KL is 0 at the minimum. That doesn't mean that I\nhave a unit minimum, right? But it does, actually. What do I need to\nuse to make sure that I have only one minimum? So the definiteness\nis guara

In [3]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT label, count(*) 
       FROM data
       group by label
       """

pysqldf(q)

Unnamed: 0,label,count(*)
0,AI,48
1,Algorithms,81
2,CS,104
3,Calculus,70
4,Data Structures,62
5,Diff. Eq.,93
6,Linear Algebra,152
7,Math for Eng.,28
8,NLP,19
9,Probability,124


In [4]:
tags = list(set(data.label))
print(tags)

['AI', 'Statistics', 'NLP', 'Calculus', 'Diff. Eq.', 'Algorithms', 'Data Structures', 'Linear Algebra', 'Math for Eng.', 'Probability', 'CS']


In [5]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT *,
       case 
       when label in ('Linear Algebra', 'Math for Eng.','Diff. Eq.','Calculus') THEN 'Math'
       when label in ('CS','Data Structures', 'AI','NLP', 'Algorithms') THEN 'Computer Science'
       when label in ('Statistics','Probability') THEN 'Statistics' 
       END AS gold_label
       FROM data
       """

temp = pysqldf(q)
temp.head()

Unnamed: 0,text,label,gold_label
0,The following content is\nprovided under a Cre...,Calculus,Math
1,"In this sequence of segments,\nwe review some ...",Probability,Statistics
2,The following content is\nprovided under a Cre...,CS,Computer Science
3,The following\ncontent is provided under a Cre...,Algorithms,Computer Science
4,The following\ncontent is provided under a Cre...,Algorithms,Computer Science


In [6]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT text, gold_label as label
       FROM temp
       """
df = pysqldf(q)
df.head()

Unnamed: 0,text,label
0,The following content is\nprovided under a Cre...,Math
1,"In this sequence of segments,\nwe review some ...",Statistics
2,The following content is\nprovided under a Cre...,Computer Science
3,The following\ncontent is provided under a Cre...,Computer Science
4,The following\ncontent is provided under a Cre...,Computer Science


In [7]:
tags = list(set(df.label))
print(tags)

['Computer Science', 'Statistics', 'Math']


In [8]:
index_to_tags_dict = {i:tag for i,tag in enumerate(tags)}
tags_to_index_dict = {tag:i for i,tag in enumerate(tags)}

print(index_to_tags_dict)
print(tags_to_index_dict)

{0: 'Computer Science', 1: 'Statistics', 2: 'Math'}
{'Computer Science': 0, 'Statistics': 1, 'Math': 2}


In [9]:
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

In [10]:
bag_of_words = {}
sentence_tokens = []
for text in df.text:
    tokens = WordPunctTokenizer().tokenize(text)
#     doc = nlp(text)
#     tokens = [token.text for token in doc] # too slow
    token_list = []
    for token in tokens:
        token = token.lower()
        if token not in '''!()-[]{};:'"\,<>./?@#$%^&*_~''':
            try:
                int(token)
                continue
            except:
                token_list.append(token)
                bag_of_words[token] = bag_of_words.get(token,0)+1
    sentence_tokens.append(token_list)

print(len(bag_of_words))

23769


In [11]:
bag_of_words

{'the': 238921,
 'following': 1233,
 'content': 516,
 'is': 121104,
 'provided': 464,
 'under': 1166,
 'a': 108551,
 'creative': 395,
 'commons': 384,
 'license': 426,
 'your': 8023,
 'support': 703,
 'will': 15857,
 'help': 918,
 'mit': 1775,
 'opencourseware': 776,
 'continue': 705,
 'to': 150006,
 'offer': 380,
 'high': 1231,
 'quality': 434,
 'educational': 367,
 'resources': 424,
 'for': 29891,
 'free': 981,
 'make': 6679,
 'donation': 357,
 'or': 17500,
 'view': 832,
 'additional': 581,
 'materials': 363,
 'from': 12715,
 'hundreds': 394,
 'of': 125267,
 'courses': 439,
 'visit': 673,
 'at': 22166,
 'ocw': 385,
 'edu': 418,
 'professor': 5854,
 'so': 94931,
 'jerison': 31,
 'relaxing': 25,
 'in': 61516,
 'sunny': 5,
 'london': 6,
 'ontario': 3,
 'today': 1471,
 'and': 124804,
 'sent': 139,
 'me': 9235,
 'as': 20199,
 'his': 466,
 'substitute': 232,
 'again': 4806,
 'i': 100988,
 'm': 19522,
 'glad': 24,
 'here': 29712,
 'see': 10623,
 'you': 93886,
 'all': 22555,
 'our': 7144,
 '

In [12]:
# delete the stop words
for word in stopwords:
    try:
        del bag_of_words[word]
    except:
        print(word,"is not in our bag of words")

len(bag_of_words)    

â€™s is not in our bag of words
â€™ll is not in our bag of words
're is not in our bag of words
thru is not in our bag of words
â€™d is not in our bag of words
whereafter is not in our bag of words
â€™m is not in our bag of words
nâ€™t is not in our bag of words
'm is not in our bag of words
â€™ve is not in our bag of words
â€™re is not in our bag of words
â€˜ll is not in our bag of words
'd is not in our bag of words
thence is not in our bag of words
thereupon is not in our bag of words
nâ€˜t is not in our bag of words
n't is not in our bag of words
latterly is not in our bag of words
â€˜re is not in our bag of words
hereafter is not in our bag of words
â€˜m is not in our bag of words
hereupon is not in our bag of words
â€˜d is not in our bag of words
whither is not in our bag of words
've is not in our bag of words
herein is not in our bag of words
seeming is not in our bag of words
â€˜ve is not in our bag of words
â€˜s is not in our bag of words
whence is not in our bag of words
's 

23475

In [13]:
bigrams = [token for token in bag_of_words.keys() if len(token) <= 2]

for word in bigrams:
    try:
        del bag_of_words[word]
    except:
        print(word,"is not in our bag of words")

len(bag_of_words)   

22504

In [14]:
a_subset = {key: value for key, value in bag_of_words.items() if value >= 5}
n = len(a_subset)
n

10025

In [15]:
top_n_words = sorted(bag_of_words.items(), key = lambda item: item[1], reverse = True)[:n]
top_n_words_to_index = {item[0]:i for i,item in enumerate(top_n_words)}


In [16]:
# Processed Subtitles

subtitles = []
for token_list in sentence_tokens:
    sub = []
    for token in token_list:
        if top_n_words_to_index.get(token,-1) != -1:
            sub.append(token)
    sub = " ".join(sub)
    subtitles.append(sub)

# remove the sountrack watermark 
temp_sub = [subtitle.replace("following content provided creative commons license support help mit opencourseware continue offer high quality educational resources free donation view additional materials hundreds mit courses visit mit opencourseware ocw mit edu ",'') for subtitle in subtitles]
subtitles = temp_sub
subtitles[0]


'professor professor jerison relaxing sunny london today sent substitute glad agenda today said talked power series taylor formula guess week right friday going little examples applications course evaluation survey hand minutes class handout says end term didn pick coming grab going people tend pick walk grab going things missing decided office hours end term hasn decided check website information looking forward final exam aren questions technical stuff right let talk power series little bit thought review story power series attention power series way writing function sum integral powers a_0 a_1 numbers example power series polynomial forgotten type power series goes finite number terms ends higher a_i perfectly good example power series special kind power series want tell today power series behave exactly like polynomials thing careful power series isn concern polynomials minute think generalized polynomials thing careful number caution number infinity number infinity inclusive absol

In [17]:
# Word Binary, Word Count and Tfidf Features for each sentence
vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(subtitles)
word_count_features = np.array(X.toarray())

vectorizer2 = TfidfVectorizer()
X2 =  vectorizer2.fit_transform(subtitles)
tfidf_features = np.array(X2.toarray())

print(word_count_features.shape)
print(tfidf_features.shape)

print(word_count_features[0])
print(tfidf_features[0])

(860, 9997)
(860, 9997)
[0 0 0 ... 0 0 0]
[0. 0. 0. ... 0. 0. 0.]


In [18]:
labels = [tags_to_index_dict[label] for label in df.label]
print(labels)

len(labels)

[2, 1, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 0, 0, 1, 0, 0, 2, 2, 2, 1, 1, 0, 0, 0, 2, 2, 0, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 2, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0, 0, 2, 0, 2, 0, 2, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 1, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 2, 0, 0, 2, 0, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2, 0, 1, 0, 2, 1, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 2, 0, 0, 1, 2, 2, 0, 1, 1, 0, 2, 1, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 1, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 

860

In [19]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT label, count(*) 
       FROM df
       group by label
       """

pysqldf(q)

Unnamed: 0,label,count(*)
0,Computer Science,314
1,Math,343
2,Statistics,203


In [30]:
# 860*0.7 = 602
train = 600
labels[:train].count(0) #224
labels[:train].count(1) #145
labels[:train].count(2) #231

231

In [21]:
labels[train:].count(0) #90
labels[train:].count(1) #58
labels[train:].count(2) #112

112

In [31]:
training = df.iloc[:600,]
testing = df.iloc[600:,]

In [32]:
testing

Unnamed: 0,text,label
600,The following content is\nprovided under a Cre...,Math
601,Let's keep building our table of\nLaplace tr...,Computer Science
602,Hello everyone. So far in the series on\ndata ...,Statistics
603,Say I have some matrix a --\nlet's say a is n ...,Computer Science
604,"In this lesson, we're going to write code\nto ...",Statistics
...,...,...
855,The following content is\nprovided under a Cre...,Math
856,&gt;&gt; [MUSIC] &gt;&gt; DAVID J. MALAN: All ...,Math
857,The following content is\nprovided by MIT Open...,Math
858,The following content is\nprovided under a Cre...,Computer Science


### Word Binary Features

In [24]:
# # Multiclass Logistic Classifier for Word Binary Features
# clf_logistic_wb = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=1000)
# clf_logistic_wb = clf_logistic_wb.fit(word_binary_features[:train], labels[:train])
# pred_logistic_wb = clf_logistic_wb.predict(word_binary_features[train:])
# accuracy_logistic_wb = np.mean(pred_logistic_wb==labels[train:])*100
# print("Accuracy =", accuracy_logistic_wb)

# cm = confusion_matrix(labels[train:], pred_logistic_wb)
# # ConfusionMatrixDisplay(cm).plot()

In [25]:
# # Mutliclass Naive Bayes Classifier for Word Binary Features
# clf_nb_wb = GaussianNB()
# clf_nb_wb = clf_nb_wb.fit(word_binary_features[:train], labels[:train])
# pred_nb_wb = clf_nb_wb.predict(word_binary_features[train:])
# accuracy_nb_wb = np.mean(pred_nb_wb==labels[train:])*100
# print("Accuracy =", accuracy_nb_wb)

# cm = confusion_matrix(labels[train:], pred_nb_wb)
# # ConfusionMatrixDisplay(cm).plot()

In [26]:
# # Random Forest Classifier for Word Binary Features
# clf_rf_wb = RandomForestClassifier(max_depth=6, random_state=0, n_estimators=100, criterion='gini')
# clf_rf_wb = clf_rf_wb.fit(word_binary_features[:train], labels[:train])
# pred_rf_wb = clf_rf_wb.predict(word_binary_features[train:])
# accuracy_rf_wb = np.mean(pred_rf_wb==labels[train:])*100
# print("Accuracy =",accuracy_rf_wb)

# cm = confusion_matrix(labels[train:], pred_rf_wb)
# cm

In [27]:
# # AdaBoost Classifier for Word Count Features
# clf_ada_wb = AdaBoostClassifier(n_estimators=100,learning_rate=1.0)
# clf_ada_wb = clf_ada_wb.fit(word_binary_features[:train], labels[:train])
# pred_ada_wb = clf_ada_wb.predict(word_binary_features[train:])
# accuracy_ada_wb = np.mean(pred_ada_wb==labels[train:])*100
# print("Accuracy =",accuracy_ada_wb)

# cm = confusion_matrix(labels[train:], pred_ada_wb)
# # ConfusionMatrixDisplay(cm).plot()

### Word Count Features

In [33]:
# Multiclass Logistic Classifier for Word Count Features
clf_logistic_wc = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=1000)
clf_logistic_wc = clf_logistic_wc.fit(word_count_features[:train], labels[:train])
pred_logistic_wc = clf_logistic_wc.predict(word_count_features[train:])
accuracy_logistic_wc = np.mean(pred_logistic_wc==labels[train:])*100
print("Accuracy =", accuracy_logistic_wc)

cm = confusion_matrix(labels[train:], pred_logistic_wc)
# ConfusionMatrixDisplay(cm).plot()

Accuracy = 41.15384615384615


In [74]:
# Mutliclass Naive Bayes Classifier for Word Count Features
clf_nb_wc = GaussianNB()
clf_nb_wc = clf_nb_wc.fit(word_count_features[:train], labels[:train])
pred_nb_wc = clf_nb_wc.predict(word_count_features[train:])
accuracy_nb_wc = np.mean(pred_nb_wc==labels[train:])*100
print("Accuracy =",accuracy_nb_wc)

cm = confusion_matrix(labels[train:], pred_nb_wc)
cm
# ConfusionMatrixDisplay(cm).plot()

Accuracy = 33.07692307692307


array([[32, 23, 35],
       [32,  4, 22],
       [41, 21, 50]])

In [35]:
# Random Forest Classifier for Word Count Features
clf_rf_wc = RandomForestClassifier(max_depth=8, random_state=0, n_estimators=300, criterion='gini')
clf_rf_wc = clf_rf_wc.fit(word_count_features[:train], labels[:train])
pred_rf_wc = clf_rf_wc.predict(word_count_features[train:])
accuracy_rf_wc = np.mean(pred_rf_wc == labels[train:])*100
print("Accuracy =",accuracy_rf_wc)

cm = confusion_matrix(labels[train:], pred_rf_wc)
cm

Accuracy = 40.38461538461539


array([[34,  0, 56],
       [20,  0, 38],
       [41,  0, 71]])

In [36]:
# AdaBoost Classifier for Word Count Features
clf_ada_wc = AdaBoostClassifier(n_estimators=100,learning_rate=1.0)
clf_ada_wc = clf_ada_wc.fit(word_count_features[:train], labels[:train])
pred_ada_wc = clf_ada_wc.predict(word_count_features[train:])
accuracy_ada_wc = np.mean(pred_ada_wc==labels[train:])*100
print("Accuracy =",accuracy_ada_wc)

cm = confusion_matrix(labels[train:], pred_ada_wc)
cm

Accuracy = 35.0


array([[37, 14, 39],
       [29,  3, 26],
       [49, 12, 51]])

### Tfidf Features

In [70]:
# Multiclass Logistic Classifier for Tfidf Features
clf_logistic_tfidf = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=500)
clf_logistic_tfidf = clf_logistic_tfidf.fit(tfidf_features[:train], labels[:train])
pred_logistic_tfidf = clf_logistic_tfidf.predict(tfidf_features[train:])
accuracy_logistic_tfidf = np.mean(pred_logistic_tfidf==labels[train:])*100
print("Accuracy =",accuracy_logistic_tfidf)

cm = confusion_matrix(labels[train:], pred_logistic_tfidf, normalize='all')
cm
# ConfusionMatrixDisplay(cm).plot()

Accuracy = 35.0


array([[0.14615385, 0.04230769, 0.15769231],
       [0.08846154, 0.        , 0.13461538],
       [0.18461538, 0.04230769, 0.20384615]])

In [71]:
# Mutliclass Naive Bayes Classifier for Tfidf Features
clf_nb_tfidf = GaussianNB()
clf_nb_tfidf = clf_nb_tfidf.fit(tfidf_features[:train], labels[:train])
pred_nb_tfidf = clf_nb_tfidf.predict(tfidf_features[train:])
accuracy_nb_tfidf = np.mean(pred_nb_tfidf==labels[train:])*100
print("Accuracy =",accuracy_nb_tfidf)

cm = confusion_matrix(labels[train:], pred_nb_tfidf, normalize='all')
cm

Accuracy = 32.30769230769231


array([[0.12692308, 0.07692308, 0.14230769],
       [0.13076923, 0.00384615, 0.08846154],
       [0.16538462, 0.07307692, 0.19230769]])

In [72]:
# Random Forest Classifier for Tfidf Features
clf_rf_tfidf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators=300, criterion='gini')
clf_rf_tfidf = clf_rf_tfidf.fit(tfidf_features[:train], labels[:train])
pred_rf_tfidf = clf_rf_tfidf.predict(tfidf_features[train:])
accuracy_rf_tfidf = np.mean(pred_rf_tfidf==labels[train:])*100
print("Accuracy =",accuracy_rf_tfidf)

cm = confusion_matrix(labels[train:], pred_rf_tfidf, normalize='all')
cm
# ConfusionMatrixDisplay(cm).plot()

Accuracy = 43.84615384615385


array([[0.12692308, 0.        , 0.21923077],
       [0.09230769, 0.        , 0.13076923],
       [0.11923077, 0.        , 0.31153846]])

In [73]:
# AdaBoost Classifier for Tfidf Features
clf_ada_tfidf = AdaBoostClassifier(n_estimators=100,learning_rate=1.0)
clf_ada_tfidf = clf_ada_tfidf.fit(tfidf_features[:train], labels[:train])
pred_ada_tfidf = clf_ada_tfidf.predict(tfidf_features[train:])
accuracy_ada_tfidf = np.mean(pred_ada_tfidf==labels[train:])*100
print("Accuracy =",accuracy_ada_tfidf)

cm = confusion_matrix(labels[train:], pred_ada_tfidf, normalize='all')
# ConfusionMatrixDisplay(cm).plot()

Accuracy = 38.84615384615385


### Error Analysis

In [41]:
# pysqldf = lambda q: sqldf(q, globals())
# q = """SELECT text,
#        CASE WHEN label = 'Computer Science' THEN 0
#             WHEN label = 'Statistics' THEN 1
#             WHEN label = 'Math' THEN 2
#        END AS num_label
#        FROM test
#        """

# test = pysqldf(q)

# test['pred_logistic_tfidf'] = pred_logistic_tfidf
# test

In [61]:
testing

Unnamed: 0,text,label
600,The following content is\nprovided under a Cre...,Math
601,Let's keep building our table of\nLaplace tr...,Computer Science
602,Hello everyone. So far in the series on\ndata ...,Statistics
603,Say I have some matrix a --\nlet's say a is n ...,Computer Science
604,"In this lesson, we're going to write code\nto ...",Statistics
...,...,...
855,The following content is\nprovided under a Cre...,Math
856,&gt;&gt; [MUSIC] &gt;&gt; DAVID J. MALAN: All ...,Math
857,The following content is\nprovided by MIT Open...,Math
858,The following content is\nprovided under a Cre...,Computer Science


In [119]:
gold_label = labels[train:]
print(gold_label)
# len(gold_label) 260

[2, 0, 1, 0, 1, 2, 0, 2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 0, 2, 2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 1, 2, 2, 2, 2, 0, 2, 0, 0, 1, 1, 1, 1, 0, 2, 0, 0, 0, 2, 2, 0, 1, 2, 1, 0, 0, 1, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0, 1, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 1, 0, 2, 1, 2, 1, 2, 1, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 2, 1, 0, 2, 0, 0, 2, 1, 0, 1, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 2, 2, 2, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 0, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1, 2, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 0, 1, 0, 1, 1, 1, 1, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2]


260

In [102]:
pred_rf_tfidf

array([2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0,
       0, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2])

In [79]:
pred_ada_tfidf

array([0, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2,
       0, 2, 2, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0,
       2, 2, 2, 0, 2, 1, 0, 2, 2, 0, 1, 0, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0,
       2, 0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 1, 0, 2, 0, 0, 0, 2, 2, 1,
       0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2,
       2, 0, 2, 2, 2, 0, 0, 1, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0,
       2, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0,
       1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 0, 0, 2, 0, 2, 0, 2, 2, 1, 0, 0, 2,
       2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 1, 2, 2, 2, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2,
       0, 2, 2, 0, 0, 2, 2, 1, 0, 1, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 2, 0,
       1, 0, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 1])

In [121]:
# word count
for i in range(1,len(gold_label)):
    if pred_ada_wc[i] != pred_rf_wc[i] and pred_ada_wc[i] != pred_logistic_wc[i] and pred_rf_wc[i] != pred_logistic_wc[i]:
        print('index', i, '| ada -', pred_ada_wc[i], '| rf -', pred_rf_wc[i], '| logistic -', pred_logistic_wc[i], '| gold -', gold_label[i])


index 2 | ada - 1 | rf - 2 | logistic - 0 | gold - 1
index 12 | ada - 2 | rf - 0 | logistic - 1 | gold - 1
index 13 | ada - 2 | rf - 0 | logistic - 1 | gold - 0
index 33 | ada - 2 | rf - 0 | logistic - 1 | gold - 0
index 79 | ada - 0 | rf - 2 | logistic - 1 | gold - 0
index 96 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 107 | ada - 0 | rf - 2 | logistic - 1 | gold - 0
index 108 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 111 | ada - 1 | rf - 2 | logistic - 0 | gold - 0
index 122 | ada - 1 | rf - 2 | logistic - 0 | gold - 0
index 153 | ada - 1 | rf - 2 | logistic - 0 | gold - 1
index 154 | ada - 1 | rf - 0 | logistic - 2 | gold - 0
index 163 | ada - 2 | rf - 0 | logistic - 1 | gold - 2
index 165 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 169 | ada - 1 | rf - 0 | logistic - 2 | gold - 2
index 185 | ada - 0 | rf - 2 | logistic - 1 | gold - 0
index 194 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 208 | ada - 0 | rf - 2 | logistic - 1 | gold - 1
index 239 | ada -

In [122]:
# tfidf
for i in range(1,len(gold_label)):
    if pred_ada_tfidf[i] != pred_rf_tfidf[i] and pred_ada_tfidf[i] != pred_logistic_tfidf[i] and pred_rf_tfidf[i] != pred_logistic_tfidf[i]:
        print('index', i, '| ada -', pred_ada_tfidf[i], '| rf -', pred_rf_tfidf[i], '| logistic -', pred_logistic_tfidf[i], '| gold -', gold_label[i])


index 63 | ada - 1 | rf - 2 | logistic - 0 | gold - 1
index 95 | ada - 1 | rf - 2 | logistic - 0 | gold - 2
index 96 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 98 | ada - 0 | rf - 2 | logistic - 1 | gold - 0
index 107 | ada - 0 | rf - 2 | logistic - 1 | gold - 0
index 108 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 144 | ada - 0 | rf - 2 | logistic - 1 | gold - 2
index 154 | ada - 1 | rf - 0 | logistic - 2 | gold - 0
index 163 | ada - 1 | rf - 2 | logistic - 0 | gold - 2
index 227 | ada - 1 | rf - 2 | logistic - 0 | gold - 2


In [179]:
subtitles[753]

'video little formal defining vector vector addition scalar multiplication video want kind basics lot examples tangible sense vectors operate let define couple vectors going vectors going video going easy draw remember set tuples ordered tuples numbers know looks like comma real numbers member reals member reals sense means right coordinate axes wanted plot know view coordinate imagine axis second coordinate plotted vertical axis traditionally axis second number axis visually represent literally single point plane continue infinity direction points number lines immediately kind bigger space said wouldn abstract examples let vectors going let define vector nice bold vector equal numbers negative vector nice bold let don know vectors let add based definition vector addition stay color don switching forth nice deep plus bolded equal add terms negative plus plus definition vector addition going equal fair came definition vector addition represent vector know coordinates know coordinate con

In [180]:
df.iloc[753,0]

"In the last video I was a little\nformal in defining what Rn is, and what a vector is,\nand what vector addition or scalar multiplication is. In this video I want to kind of\ngo back to basics and just give you a lot of examples. And give you a more tangible\nsense for what vectors are and how we operate with them. So let me define a couple\nof vectors here. And I'm going to do, most of my\nvectors I'm going to do in this video are going\nto be in R2. And that's because they're\neasy to draw. Remember R2 is the set\nof all 2-tuples. Ordered 2-tuples where each of\nthe numbers, so you know you could have x1, my 1 looks like a\ncomma, x1 and x2, where each of these are real numbers. So you each of them, x1 is a\nmember of the reals, and x2 is a member of the reals. And just to give you a sense\nof what that means, if this right here is my coordinate\naxes, and I wanted a plot all my x1's, x2's. You know you could view this\nas the first coordinate. We always imagine that\nas our x-axis.

In [171]:
df[df['text'].str.contains('divide and conquer')]

Unnamed: 0,text,label
13,The following content is\nprovided under a Cre...,Statistics
16,The following content is\nprovided under a Cre...,Computer Science
36,The following content is\nprovided under a Cre...,Math
49,The following content is\nprovided under a Cre...,Math
52,The following\ncontent is provided under a Cre...,Statistics
56,OPERATOR: The following content\nis provided u...,Math
67,NARRATOR: The following content\nis provided u...,Statistics
104,The following content is\nprovided under a Cre...,Statistics
126,The following content is\nprovided under a Cre...,Statistics
141,Let us now revisit the second\ncalculation tha...,Statistics


In [183]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT label, count(*)
       FROM df
       where text like '%divide and conquer%' 
       group by label
       """

pysqldf(q)

Unnamed: 0,label,count(*)
0,Computer Science,13
1,Math,14
2,Statistics,10


In [182]:
pysqldf = lambda q: sqldf(q, globals())
q = """SELECT label, count(*)
       FROM df
       where text like '%distribution%' 
       group by label
       """

pysqldf(q)

Unnamed: 0,label,count(*)
0,Computer Science,60
1,Math,75
2,Statistics,45
