In [3]:
import datasets as ds
import string
import math
import numpy as np

## The Dataset

In [4]:
ds.get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

### 1. How many splits does the dataset has?

    The dataset is composed of 3 splits : train, test and unsupervised

In [5]:
ds_train = ds.load_dataset("imdb", split="train")
ds_train

Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [6]:
ds_test = ds.load_dataset("imdb", split="test")
ds_test

Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [7]:
ds_unsupervised = ds.load_dataset("imdb", split="unsupervised")
ds_unsupervised

Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

### 2. How big are these splits? 

    * train : 25000 	

    * test : 25000 	

    * unsupervised : 50000

In [8]:
ds_test.filter(lambda d: d['label'] == 1)

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-568243c224eb1160.arrow


Dataset({
    features: ['text', 'label'],
    num_rows: 12500
})

In [9]:
ds_train.filter(lambda d: d['label'] == 1)

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d036cc1248b6b6bf.arrow


Dataset({
    features: ['text', 'label'],
    num_rows: 12500
})

### 3. What is the proportion of each class on the supervised splits?

    * train : 12500 negatives and 12500 positives

    * test : 12500 negatives adn 12500 positives

## Naive Bayes classifier

### 1.

In [10]:
def preprocessing(base_text: str):
  """
  Preprocess the text before classification
  Args:
    base_text: the string to preprocess
  Return:
    The preprocessed text
  """
  base_text = base_text.lower()
  base_text.replace("<br />",' ')
  text = ""
  ponct = string.punctuation
  for char in base_text:
    if char in ponct:
      text += ' '
    else:
      text += char
  return text

vectorized_preprocessing = np.vectorize(preprocessing) # vectorizing the function to be faster using numpy

### 2.

In [11]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def naive_bayes_classifier(document, preprocessed_doc, classes, preprocessed_classes):
  """
  Execute a naive Bayes Classifier on document
  Args:
    document: the complete dataset from imdb
    classes: list of the element of the dataset separated between positives and negatives
  Return:
    logprior: probability of each class in the dataset in a list
    loglikelihood: probabilities of words to belong to a certain class in a list
    v: the vocabulary of the dataset
  """
  ndoc = document.num_rows
  words_by_texts = flatten([text.split() for text in preprocessed_doc])
  v = list(set(words_by_texts)) # list of all the words
  logprior = []
  loglikelihood = [[], []]
  current_class = 0

  for i in range(len(classes)):
    c = classes[i]
    nc = c.num_rows
    logprior.append(math.log(nc / ndoc))
    bigdoc = flatten([text.split() for text in preprocessed_classes[i]])

    # creating an histogram
    histo = {}
    for word in bigdoc:
      if word in histo:
        histo[word] += 1
      else:
        histo[word] = 1

    for voc_word in v:
      if voc_word in histo:
        count = histo[voc_word]
        loglikelihood[current_class].append(math.log((count+1)/(len(bigdoc) + len(v))))
      else:
        loglikelihood[current_class].append(math.log(1/(len(bigdoc) + len(v))))
    current_class+=1

  return logprior,loglikelihood,v

In [12]:
classes_preprocessed = vectorized_preprocessing(np.array([ds_train.filter(lambda d: d['label'] == 0)['text'],  ds_train.filter(lambda d: d['label'] == 1)['text']]))
document_preprocessed = vectorized_preprocessing(np.array(ds_train['text']))

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d5f51b718c020c21.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d036cc1248b6b6bf.arrow


In [13]:
classes = [ds_train.filter(lambda d: d['label'] == 0),  ds_train.filter(lambda d: d['label'] == 1)]
logprior, loglikelihood, v = naive_bayes_classifier(ds_train, document_preprocessed, classes, classes_preprocessed)
loglikelihood_dictionnarry = {}
for i in range(len(v)):
  loglikelihood_dictionnarry[v[i]] = (loglikelihood[0][i], loglikelihood[1][i])

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d5f51b718c020c21.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d036cc1248b6b6bf.arrow


In [14]:
import numpy as np
def get_value(x):
    if x in loglikelihood_dictionnarry:
      return (loglikelihood_dictionnarry[x][0], loglikelihood_dictionnarry[x][1])
    return (0,0)
get_values = np.vectorize(get_value)
def testbayes(doc, logprior, loglikelihood_dictionnarry, preprocess = True):
  """
  Predict a class with the naive bayes classifier
  Args:
    doc: the string to classify
    logprior: probability of each class in the dataset in a list
    loglikelihood: probabilities of words to belong to a certain class in a list
    classes: list of the element of the dataset separated between positives and negatives
    v: the vocabulary of the dataset
  Return:
    The predicted class of the string 'doc'
  """
  sum0, sum1 = (logprior[0], logprior[1])
  splitted_doc = np.array(doc.split())
  sum_values = get_values(splitted_doc)
  sum0, sum1 = sum0 + sum_values[0].sum(), sum1 + sum_values[1].sum()
  return 0 if sum0 > sum1 else 1

def test_bayes_list(x, logprior, loglikelihood_dictionnarry):
  return np.array([testbayes(xi, logprior, loglikelihood_dictionnarry, preprocess = False) for xi in x])

In [15]:
classes = vectorized_preprocessing(np.array([ds_test.filter(lambda d: d['label'] == 0)['text'],  ds_test.filter(lambda d: d['label'] == 1)['text']]))
print("negatives (should be 0)")
print(testbayes(classes[0][0],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][1],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][2],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][100],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][50],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][200],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[0][250],logprior,loglikelihood_dictionnarry))
print("positive (should be 1)")
print(testbayes(classes[1][0],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][1],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][2],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][100],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][50],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][200],logprior,loglikelihood_dictionnarry))
print(testbayes(classes[1][250],logprior,loglikelihood_dictionnarry))

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b72ebb77c49bacbc.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-568243c224eb1160.arrow


negatives (should be 0)
0
0
0
1
0
0
0
positive (should be 1)
1
1
1
1
1
1
1


### 3.

In [16]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

sklearn_bayes = make_pipeline(CountVectorizer(), MultinomialNB())
sklearn_bayes

In [17]:
model = sklearn_bayes.fit(X=vectorized_preprocessing(np.array(ds_train['text'])),  y=np.array(ds_train['label']))

In [18]:
classes = vectorized_preprocessing(np.array([ds_test.filter(lambda d: d['label'] == 0)['text'],  ds_test.filter(lambda d: d['label'] == 1)['text']]))
print("negatives (should be 0)")
print(model.predict([classes[0][0]]))
print(model.predict([classes[0][1]]))
print(model.predict([classes[0][2]]))
print(model.predict([classes[0][100]]))
print(model.predict([classes[0][50]]))
print(model.predict([classes[0][200]]))
print(model.predict([classes[0][250]]))
print("positive (should be 1)")
print(model.predict([classes[1][0]]))
print(model.predict([classes[1][1]]))
print(model.predict([classes[1][2]]))
print(model.predict([classes[1][100]]))
print(model.predict([classes[1][50]]))
print(model.predict([classes[1][200]]))
print(model.predict([classes[1][250]]))

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b72ebb77c49bacbc.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-568243c224eb1160.arrow


negatives (should be 0)
[0]
[0]
[0]
[1]
[0]
[0]
[0]
positive (should be 1)
[1]
[1]
[1]
[1]
[1]
[1]
[1]


### 4.

In [19]:
X_train = vectorized_preprocessing(np.array(ds_train['text']))
Y_train = np.array(ds_train['label'])

X_test = vectorized_preprocessing(np.array(ds_test['text']))
Y_test = np.array(ds_test['label'])

In [20]:
y_train_pred_student = test_bayes_list(X_train, logprior, loglikelihood_dictionnarry)
y_train_pred_scikit = model.predict(X_train)
y_test_pred_student = test_bayes_list(X_test, logprior, loglikelihood_dictionnarry)
y_test_pred_scikit = model.predict(X_test)

In [21]:
import sklearn.metrics as metrics

train_accurracy_student = metrics.accuracy_score(Y_train, y_train_pred_student)
train_accurracy_scikit = metrics.accuracy_score(Y_train, y_train_pred_scikit)
test_accurracy_student = metrics.accuracy_score(Y_test, y_test_pred_student)
test_accurracy_scikit = metrics.accuracy_score(Y_test, y_test_pred_scikit)

print(f"Accuracy student: train: {train_accurracy_student}, test: {test_accurracy_student}")
print(f"Accuracy scikit: train: {train_accurracy_scikit}, test: {test_accurracy_scikit}")

Accuracy student: train: 0.89364, test: 0.80924
Accuracy scikit: train: 0.89808, test: 0.8136


### 5. Why does scikit-learn implementation have a better Accuracy ?
    
    The slight difference in accuracy is probably due to scikit-learn using the accuracy value during training to have a better model.

### 6. Why is accuracy a sufficient measure of evaluation here ?

    With imbalanced data-sets, accuracy is not a good measure, since it does not distinguish between the numbers of correctly classified examples of different classes. In this case, the classes are perfectly balanced, so the accuracy gives a good insight of the model's reliability.

7.

In [22]:
Error = []
for i in range(len(X_test)):
  pred = model.predict([X_test[i]])[0]
  if pred != Y_test[i]:
    Error.append((pred, ds_test[i]['text']))
  if len(Error) == 2:
    break

In [23]:
Error[0]

(1,
 "Blind Date (Columbia Pictures, 1934), was a decent film, but I have a few issues with this film. First of all, I don't fault the actors in this film at all, but more or less, I have a problem with the script. Also, I understand that this film was made in the 1930's and people were looking to escape reality, but the script made Ann Sothern's character look weak. She kept going back and forth between suitors and I felt as though she should have stayed with Paul Kelly's character in the end. He truly did care about her and her family and would have done anything for her and he did by giving her up in the end to fickle Neil Hamilton who in my opinion was only out for a good time. Paul Kelly's character, although a workaholic was a man of integrity and truly loved Kitty (Ann Sothern) as opposed to Neil Hamilton, while he did like her a lot, I didn't see the depth of love that he had for her character. The production values were great, but the script could have used a little work.")

In [24]:
Error[1]

(1,
 "Ben, (Rupert Grint), is a deeply unhappy adolescent, the son of his unhappily married parents. His father, (Nicholas Farrell), is a vicar and his mother, (Laura Linney), is ... well, let's just say she's a somewhat hypocritical soldier in Jesus' army. It's only when he takes a summer job as an assistant to a foul-mouthed, eccentric, once-famous and now-forgotten actress Evie Walton, (Julie Walters), that he finally finds himself in true 'Harold and Maude' fashion. Of course, Evie is deeply unhappy herself and it's only when these two sad sacks find each other that they can put their mutual misery aside and hit the road to happiness.<br /><br />Of course it's corny and sentimental and very predictable but it has a hard side to it, too and Walters, who could sleep-walk her way through this sort of thing if she wanted, is excellent. It's when she puts the craziness to one side and finds the pathos in the character, (like hitting the bottle and throwing up in the sink), that she's at

Those two examples were predicted positives althought they were negatives. We should take a look a a true positive to figure out the similitudes.

In [25]:
ds_test.filter(lambda d: d['label'] == 1)[0]['text'].replace('<br />', ' ')

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-568243c224eb1160.arrow


"Previous reviewer Claudio Carvalho gave a much better recap of the film's plot details than I could. What I recall mostly is that it was just so beautiful, in every sense - emotionally, visually, editorially - just gorgeous.  If you like movies that are wonderful to look at, and also have emotional content to which that beauty is relevant, I think you will be glad to have seen this extraordinary and unusual work of art.  On a scale of 1 to 10, I'd give it about an 8.75. The only reason I shy away from 9 is that it is a mood piece. If you are in the mood for a really artistic, very romantic film, then it's a 10. I definitely think it's a must-see, but none of us can be in that mood all the time, so, overall, 8.75."

For the first one, we can say that it has been wrongly set as positive because the writer is conceding some good aspect to the film  even while he didn't liked it in then end. However, for the second one, The reviewer said things that happen positively during the film which has been considered by the model as a positive review of the film instead of a review of someone who didn't liked it. To sum up, in these two cases, there are many positives wortds which made the model classify them as positives reviews.

## Stemming and Lemmatization

### 1. We choose lemmatization.

In [26]:
import spacy

# loading the small English model
nlp = spacy.load("en_core_web_sm")

In [28]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

def preprocessing_lemma(base_text: str):
  """
  Preprocess the text before classification
  Args:
    base_text: the string to preprocess
  Return:
    The preprocessed text
  """
  base_text = base_text.lower()
  base_text.replace("<br />",' ')
  text = ""
  ponct = string.punctuation
  for char in base_text:
    if char in ponct:
      text += ' '
    else:
      text += char
  lemmas = [token.lemma_ for token in nlp(text.lower())]
  return " ".join(lemmas)

vectorized_preprocessing_lemma = np.vectorize(preprocessing_lemma)

### 2.

In [29]:
classes_preprocessed = vectorized_preprocessing_lemma(np.array([ds_train.filter(lambda d: d['label'] == 0)['text'],  ds_train.filter(lambda d: d['label'] == 1)['text']]))
document_preprocessed = vectorized_preprocessing_lemma(np.array(ds_train['text']))

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d5f51b718c020c21.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d036cc1248b6b6bf.arrow


In [30]:
classes = [ds_train.filter(lambda d: d['label'] == 0),  ds_train.filter(lambda d: d['label'] == 1)]
logprior, loglikelihood, v = naive_bayes_classifier(ds_train, document_preprocessed, classes, classes_preprocessed)
loglikelihood_dictionnarry = {}
for i in range(len(v)):
  loglikelihood_dictionnarry[v[i]] = (loglikelihood[0][i], loglikelihood[1][i])

Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d5f51b718c020c21.arrow
Loading cached processed dataset at /home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d036cc1248b6b6bf.arrow


In [31]:
X_train = document_preprocessed.copy()
Y_train = np.array(ds_train['label'])

X_test = vectorized_preprocessing_lemma(np.array(ds_test['text']))
Y_test = np.array(ds_test['label'])

In [32]:
y_train_pred_student = test_bayes_list(X_train, logprior, loglikelihood_dictionnarry)
y_test_pred_student = test_bayes_list(X_test, logprior, loglikelihood_dictionnarry)

In [33]:
train_accurracy_student = metrics.accuracy_score(Y_train, y_train_pred_student)
test_accurracy_student = metrics.accuracy_score(Y_test, y_test_pred_student)

print(f"Accuracy student: train: {train_accurracy_student}, test: {test_accurracy_student}")

Accuracy student: train: 0.88632, test: 0.80516


### 3. We can see that the accuracy obtained with the lemmatization is worse than without it, why ?
    
    The drop in the accurracy value is probably due to the loss of meanings, especially of tenses, due to lemmatization which does not work in favor of the naive bayes model. Indeed we can see that this loss of meaning (which in some contexts where the word order is respected can be beneficial) makes the model slightly less good.