In [37]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import pickle
 
import re, string 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /Users/teddyoweh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/teddyoweh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/teddyoweh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/teddyoweh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
cheat_path='./data/cheat.txt'
clean_path='./data/clean.txt'


In [39]:
from data import load_data, analyze
df_train = load_data(cheat_path,clean_path)

In [40]:
df_train

Unnamed: 0,text,target
0,Can you give me the answers to this homework a...,1
1,"I'm struggling with this test, can you help me...",1
2,Do you know where I can find the answers to th...,1
3,"I'm really stuck on this homework, can you jus...",1
4,Is there any way you can send me the answers t...,1
...,...,...
235,Have you started studying for midterms yet?,0
236,I'm thinking about getting involved in communi...,0
237,Have you started thinking about your post-grad...,0
238,I'm thinking about joining a study abroad prog...,0


In [41]:
x=df_train['target'].value_counts()

print(x)
df_train.isna().sum()


1    120
0    120
Name: target, dtype: int64


text      0
target    0
dtype: int64

In [42]:
analyze(df_train) # Mean Data
     

Word Count
Cheat Sentences:  18.6
Clean Sentences:  16.966666666666665

Character Count
Cheat Sentences:  89.44166666666666
Clean Sentences:  94.175

Unique Word Count
Cheat Sentences:  17.291666666666668
Clean Sentences:  16.166666666666668


In [43]:
text = "  test sd e%: , ?, ''  a     .  "

def preprocess(text):
    text = text.lower() 
    text=text.strip() 
    text=re.compile('<.*?>').sub('', text)  
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)   
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text)  
    text = re.sub(r'\s+',' ',text)  
    
    return text

text=preprocess(text)
print(text)   

test sd e a


In [44]:
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

text=stopword(text)
print(text)
 
    
snow = SnowballStemmer('english')
def stemming(string):
    a=[snow.stem(i) for i in word_tokenize(string) ]
    return " ".join(a)
text=stemming(text)
print(text)

 
wl = WordNetLemmatizer()
 
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) 
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]  
    return " ".join(a)

text = lemmatizer(text)
print(text)

test sd e
test sd e
test sd e


In [45]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))

df_train['clean_text'] = df_train['text'].apply(lambda x: finalpreprocess(x))
df_train=df_train.drop(columns=['word_count','char_count','unique_word_count'])
df_train.head()

Unnamed: 0,text,target,clean_text
0,Can you give me the answers to this homework a...,1,give answer homework assignment
1,"I'm struggling with this test, can you help me...",1,struggle test help answer
2,Do you know where I can find the answers to th...,1,know find answer quiz
3,"I'm really stuck on this homework, can you jus...",1,really stuck homework give answer
4,Is there any way you can send me the answers t...,1,way send answer exam


In [46]:
df_train['clean_text_tok']=[nltk.word_tokenize(i) for i in df_train['clean_text']] 
model = Word2Vec(df_train['clean_text_tok'],min_count=1) 
 


w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))  
#fix the syn0 error stuff
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
 
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [47]:
X_train, X_val, y_train, y_val = train_test_split(df_train["clean_text"],
                                                  df_train["target"],
                                                  test_size=0.2,
                                                  shuffle=True)
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  #for word2vec
X_val_tok= [nltk.word_tokenize(i) for i in X_val]      #for word2vec

 
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)  
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)  
 
modelw = MeanEmbeddingVectorizer(w2v)
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_val_tok)
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

In [48]:
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  #model

 
y_predict = lr_tfidf.predict(X_val_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 

print(classification_report(y_val,y_predict))
print('Confusion Matrix:',confusion_matrix(y_val, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)  

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        25
           1       0.96      1.00      0.98        23

    accuracy                           0.98        48
   macro avg       0.98      0.98      0.98        48
weighted avg       0.98      0.98      0.98        48

Confusion Matrix: [[24  1]
 [ 0 23]]
AUC: 1.0


In [49]:
### goooodd resultss!!! naive bayes next

In [50]:
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  #model
#cheat_model
#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_val_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 

print(classification_report(y_val,y_predict))
print('Confusion Matrix:',confusion_matrix(y_val, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)  

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        25
           1       0.96      1.00      0.98        23

    accuracy                           0.98        48
   macro avg       0.98      0.98      0.98        48
weighted avg       0.98      0.98      0.98        48

Confusion Matrix: [[24  1]
 [ 0 23]]
AUC: 1.0


In [51]:
#perfect

In [52]:
lr_w2v=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_w2v.fit(X_train_vectors_w2v, y_train)  #model

 
y_predict = lr_w2v.predict(X_val_vectors_w2v)
y_prob = lr_w2v.predict_proba(X_val_vectors_w2v)[:,1]
 

print(classification_report(y_val,y_predict))
print('Confusion Matrix:',confusion_matrix(y_val, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)  

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        25
           1       1.00      0.96      0.98        23

    accuracy                           0.98        48
   macro avg       0.98      0.98      0.98        48
weighted avg       0.98      0.98      0.98        48

Confusion Matrix: [[25  0]
 [ 1 22]]
AUC: 0.9965217391304348


In [53]:
#naive bayes is best
cheat_model=lr_tfidf

In [54]:

def predict_sentence(sentence):
    sentence = finalpreprocess(sentence)  
    X_vector = tfidf_vectorizer.transform([sentence])  # convert the input sentence to a vector
    y_predict = cheat_model.predict(X_vector)  # use the trained model to make a prediction
    y_prob = cheat_model.predict_proba(X_vector)[:, 1]  # get the probability of the prediction
    return y_predict, y_prob


In [55]:
predict_sentence('how do i pass the test')
# using 0.6 as the baseline

(array([0]), array([0.44715957]))

In [56]:
from data import save_model

In [57]:
save_model(cheat_model)