In [1]:
import csv

#read file
with open("movie1.csv", "r", encoding='utf-8') as file:
    reader = csv.reader(file)
    lines = []

    for i, row in enumerate(reader):
        if i == 0:
            continue

        lines.append(row)

lines


[['5296',
  'The original Road House was a classic cheesy 80s movie, which although it didn\'t have anywhere near award worthy writing or acting, was a very enjoyable and popular film, largely due to the presence of star Patrick Swayze and the great supporting cast, along with some excellent fight scenes and eye candy.<br /><br />16 years later, and MGM / Sony attempts to re-create the magic which left us all quoting one liners and reciting the three rules of bouncing... with a movie which quotes all the original\'s best one liners and recites the three rules.<br /><br />Were this an amateur fan made film, it would be seen as a loving homage to one of the most popular of Swayze\'s movies. As a professionally made film, it falls flat on it\'s face right into the DVD Bargin Bin, with its continual reuse of lines and plot from the original movie becoming more of an annoying sign of lack of originality rather than cool references to the original.<br /><br />Having said that, with new lines

In [2]:
# Масив відгуків
reviews = [line[1] for line in lines]
reviews[:5]

['The original Road House was a classic cheesy 80s movie, which although it didn\'t have anywhere near award worthy writing or acting, was a very enjoyable and popular film, largely due to the presence of star Patrick Swayze and the great supporting cast, along with some excellent fight scenes and eye candy.<br /><br />16 years later, and MGM / Sony attempts to re-create the magic which left us all quoting one liners and reciting the three rules of bouncing... with a movie which quotes all the original\'s best one liners and recites the three rules.<br /><br />Were this an amateur fan made film, it would be seen as a loving homage to one of the most popular of Swayze\'s movies. As a professionally made film, it falls flat on it\'s face right into the DVD Bargin Bin, with its continual reuse of lines and plot from the original movie becoming more of an annoying sign of lack of originality rather than cool references to the original.<br /><br />Having said that, with new lines such as "I

In [3]:
import nltk 
import numpy as np
import re

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A) 
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
reviews = normalize_corpus(reviews)

reviews[:5]

array(['original road house classic cheesy movie although didnt anywhere near award worthy writing acting enjoyable popular film largely due presence star patrick swayze great supporting cast along excellent fight scenes eye candybr br years later mgm sony attempts recreate magic left us quoting one liners reciting three rules bouncing movie quotes originals best one liners recites three rulesbr br amateur fan made film would seen loving homage one popular swayzes movies professionally made film falls flat face right dvd bargin bin continual reuse lines plot original movie becoming annoying sign lack originality rather cool references originalbr br said new lines im gonna kill like killed father wonder screenwriters decided rehash much original scriptbr br knew never going anything special straight dvd sequel least hoped might couple new ideas fresh things included live road house name get simply remake film little narcotics added inbr br director id removed references first film tarni

In [4]:
# масив настроїв відповідних відгуків

rates = [int(line[2]) for line in lines]
reviews_rates = [(rev, rat) for rev,rat in zip(reviews, rates)]
reviews_rates[:5]

[('original road house classic cheesy movie although didnt anywhere near award worthy writing acting enjoyable popular film largely due presence star patrick swayze great supporting cast along excellent fight scenes eye candybr br years later mgm sony attempts recreate magic left us quoting one liners reciting three rules bouncing movie quotes originals best one liners recites three rulesbr br amateur fan made film would seen loving homage one popular swayzes movies professionally made film falls flat face right dvd bargin bin continual reuse lines plot original movie becoming annoying sign lack originality rather cool references originalbr br said new lines im gonna kill like killed father wonder screenwriters decided rehash much original scriptbr br knew never going anything special straight dvd sequel least hoped might couple new ideas fresh things included live road house name get simply remake film little narcotics added inbr br director id removed references first film tarnish or

In [5]:
from sklearn.model_selection import train_test_split

#поділ на вибірки
X_train, X_test, Y_train, Y_test = train_test_split(reviews, rates, test_size=0.3, random_state=5)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# naive bayes
cv = CountVectorizer()
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train,Y_train)

MultinomialNB()

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

mnb_score = mnb.score(X_test, Y_test)
mnb_confusion_matrix = confusion_matrix(Y_test, mnb.predict(X_test))

print(f"MNB Score: {mnb_score:.3}")
print(f"MNB Confusion Matrix:\n{mnb_confusion_matrix}")

MNB Score: 0.848
MNB Confusion Matrix:
[[2628  393]
 [ 520 2459]]


In [8]:
import textblob

# text blob
def text_blob_rate(review: str) -> int:
    tb = textblob.TextBlob(review)
    polarity = tb.sentiment.polarity
    return 1 if polarity >= 0.1 else 0

In [13]:
from sklearn.metrics import accuracy_score

predicted_scores = []

for i, (review, rate) in enumerate(zip(reviews, rates)):
    predicted_scores.append(text_blob_rate(review))

textblob_matrix = confusion_matrix(rates, predicted_scores)
textblob_score = accuracy_score(rates, predicted_scores)

print("Confusion matrix:")
print(textblob_matrix)

print(f"\nScore: {textblob_score}")

Confusion matrix:
[[7614 2395]
 [2390 7601]]

Score: 0.76075


In [10]:
# testing
test_reviews = ["A young girl becomes a war-time marine's pen-pal, and when he visits at war's end expecting someone a bit more ""available,"" comic complications ensue. All ultimately works out well, naturally, but not before everyone involved has thoroughly chewed the scenery. Errol Flynn's dead-on impression of Humphrey Bogart from ""Casablanca"" is a highlight, as are various send-ups of his own swashbuckling image (the ""jumping"" scene in the kitchen with Forrest Tucker is a riot). It is Tucker, though, who ""tucks"" the movie under his arm, lowers his head and barrels over the goal line. He demonstrates the comic flair more fully developed twenty years later in ""F-Troop"" and imparts a liveliness and energy that Flynn repeatedly plays off to raise his own performance. Eleanor Parker does a fine job as the woman being pursued, and little Patti Brady charms as Tucker's actual pen-pal friend. A fine, lightweight ""coming home"" comedy in a genteel setting that children and romantics of all ages should find entertaining.",
                "This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 years were brilliant, but things dropped off after that. By 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.<br /><br />It's truly disgraceful how far this show has fallen. The writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. I find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. How can one recognize such brilliance and then see fit to replace it with such mediocrity? I felt I must give 2 stars out of respect for the original cast that made this show such a huge success. As it is now, the show is just awful. I can't believe it's still on the air.",
                "Cheesy script, cheesy one-liners. Timothy Hutton's performance a ""little"" over the top. David Duchovny still seemed to be stuck in his Fox Mulder mode. No chemistry with his large-lipped female co-star.He needs Gillian Anderson to shine. He does not seem to have any talent of his own.",
                "A LAUREL & HARDY Comedy Short. The Boys arrive to sweep the chimneys at the home of Professor Noodle, a mad scientist who's just perfected his rejuvenation serum. Stan & Ollie proceed with their DIRTY WORK, spreading destruction inside the house and on the roof. Then the Professor wants to try out his new potion...<br /><br />A very funny little film. The ending is a bit abrupt, but much of the slapstick leading up to it is terrific. Especially good is Stan & Ollie's contest of wills at opposite ends of the chimney. That's Lucien Littlefield as the Professor."]

test_reviews_rates = [1, 0, 0, 1]

In [11]:
# with TextBlob
text_blob_results = []

for i in range(len(test_reviews_rates)):
    tb_res = text_blob_rate(test_reviews[i])
    print(f"Test review {i+1} result: {tb_res} (actual: {test_reviews_rates[i]})")
    text_blob_results.append(tb_res)

print(f"TextBlob Accuracy: {accuracy_score(test_reviews_rates, text_blob_results)}")

Test review 1 result: 1 (actual: 1)
Test review 2 result: 0 (actual: 0)
Test review 3 result: 0 (actual: 0)
Test review 4 result: 0 (actual: 1)
TextBlob Accuracy: 0.75


In [12]:
# with Sklearn
sklearn_results = mnb.predict(cv.transform(test_reviews))
print(f"MNB predict: {sklearn_results}")
print(f"Actual: {test_reviews_rates}")
print(f"Sklearn MultinomialNB Accuracy: {accuracy_score(test_reviews_rates, sklearn_results)}")

MNB predict: [1 0 1 1]
Actual: [1, 0, 0, 1]
Sklearn MultinomialNB Accuracy: 0.75
