In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sunil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('akut.csv')
df['Derived Generic Category'].unique()

array(['Assignment', 'Quiz', 'Homework', 'Test', 'Extra Credit'],
      dtype=object)

In [3]:
df.groupby(['Derived Generic Category']).count()

Unnamed: 0_level_0,Teacher,School,Assignment Name,School Category
Derived Generic Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Assignment,605,605,605,605
Extra Credit,37,37,37,37
Homework,327,327,327,327
Quiz,145,145,145,145
Test,102,102,102,102


In [4]:
txt_filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short]

def process_input(row):
    input_merged = row['Assignment Name'] + ' ' + row['School Category']
    
    # gensim's preprocess_string through series of txt_filters which generates tokens array
    input_processed_tokens = " ".join(preprocess_string(input_merged, txt_filters))
    
    # input_processed_tokens is deduplicated to form final input string
    #input_processed = " ".join(sorted(set(input_processed_tokens), key=input_processed_tokens.index))
    return input_processed_tokens
    
df['processed_input'] = df.apply(lambda row: process_input(row), axis=1)

In [5]:
# Simple check to see as how many records have their LABEL present as-is in Assignment Text or School Category
# 738 of 1216 are that simple
'''
simple = 0
txt_data = df['Assignment Name'] + ' ' + df['School Category']

for i in range(1, df.shape[0]):
    #print(df['Derived Generic Category'][i], txt_data[i])
    if df['Derived Generic Category'][i].lower() in txt_data[i].lower():
        simple += 1
    #else:
        #print(df['Derived Generic Category'][i], txt_data[i])

print(simple)
'''

"\nsimple = 0\ntxt_data = df['Assignment Name'] + ' ' + df['School Category']\n\nfor i in range(1, df.shape[0]):\n    #print(df['Derived Generic Category'][i], txt_data[i])\n    if df['Derived Generic Category'][i].lower() in txt_data[i].lower():\n        simple += 1\n    #else:\n        #print(df['Derived Generic Category'][i], txt_data[i])\n\nprint(simple)\n"

In [6]:
# 0th index based 6th is this new 'label' column
df['label'] = pd.factorize(df['Derived Generic Category'])[0]

X_train, X_test, y_train, y_test = train_test_split(df['processed_input'], df['label'], test_size=0.1)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

clf = MultinomialNB().fit(X_train_counts, y_train)
# -vs-
# 1188 correct out of 1216
#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = count_vect.fit_transform(X_train_counts)
#clf = MultinomialNB().fit(X_train_counts, y_train)

In [7]:
predicted_label = clf.predict(count_vect.transform(df['processed_input']))

In [8]:
for i in range(0, df.shape[0]):
    if predicted_label[i] != df['label'][i]:
        print(i, predicted_label[i], df['processed_input'][i], df['label'][i])

48 1 religions venn diagram worksheets 0
70 1 major religion group projects group grade 0
229 1 test tests quizzes 3
731 2 caravana reading 0
916 2 post movie prompt essays 0
976 2 reading reading 0
977 2 reading reading 0
1011 2 september worksheet worksheets 0


In [9]:
score_truth = (predicted_label == df['label'].values)
score_truth.sum()

1208