In [13]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

def calculatewordfreq(words):
    """Count how many times each item occurs in ``words``.

    Args:
        words: Iterable of hashable items (plain tokens or n-gram tuples).

    Returns:
        dict mapping every distinct item to its number of occurrences,
        with keys in order of first appearance.
    """
    counts = {}
    for token in words:
        # An unseen token reads as 0, so its first sighting is stored as 1.
        counts[token] = counts.get(token, 0) + 1
    return counts



def calculate_ngrams(docs, n):
    """Split each document on whitespace and return its word n-grams.

    Args:
        docs: Iterable of strings, one per document.
        n: Number of consecutive words per n-gram; must be >= 1.

    Returns:
        A list with one entry per document. Each entry is the list of
        n-gram tuples in the order they occur in that document. A document
        with fewer than ``n`` words contributes an empty list.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Reject n < 1 up front: n == 0 would emit a run of empty tuples and a
    # negative n would slice relative to the end of the word list, producing
    # meaningless "n-grams" instead of an error.
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    ngram_list = []
    for doc in docs:
        words = doc.split()
        # Slide a window of n words across the document.
        ngram_list.append(
            [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
        )
    return ngram_list


def ngrams_to_vector(ngrams, vocabulary):
    """Turn per-document n-gram lists into count vectors over ``vocabulary``.

    Args:
        ngrams: Iterable of documents, each an iterable of hashable n-grams.
        vocabulary: Collection of n-grams that defines the vector components.
            It is iterated once per document, so component order is whatever
            order that iteration yields.

    Returns:
        A list holding one count vector (a list of ints) per document; a
        vocabulary term that does not occur in a document contributes 0.
    """
    rows = []
    for doc_ngrams in ngrams:
        frequencies = calculatewordfreq(doc_ngrams)
        # One component per vocabulary term, defaulting to 0 when absent.
        rows.append([frequencies.get(term, 0) for term in vocabulary])
    return rows

In [14]:
# Load the review CSV; the column at position 4 supplies the documents and the
# column at position 5 supplies the values that are turned into class labels.
# NOTE(review): presumably review text and star rating - confirm against the CSV header.
data = pd.read_csv('/Users/swastikagarwal/Downloads/PERSONAL/sem 5 codes/NLP/Musical_instruments_reviews 4.csv')
x = data.iloc[:, 4].values
y = data.iloc[:, 5].values

# Collapse the labels in place into three classes: 4 or 5 -> 1, 3 -> 0, anything else -> -1.
for i, rating in enumerate(y):
    if rating in (4, 5):
        y[i] = 1
    elif rating == 3:
        y[i] = 0
    else:
        y[i] = -1

# Balance the classes by keeping only the first samples seen for each label.
# The check admits counts 0..467, i.e. at most 468 samples per class.
x2, y2 = [], []
c = {-1: 0, 0: 0, 1: 0}
for text, label in zip(x, y):
    if c[label] > 467:
        continue
    x2.append(text)
    y2.append(label)
    c[label] += 1

x, y = x2, y2

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# The n-gram size is read interactively and reused by the later prediction cells.
n = int(input("Enter the value of n for n-grams: "))
xtrain_ngrams = calculate_ngrams(x_train, n)
xtest_ngrams = calculate_ngrams(x_test, n)

# The vocabulary is every distinct n-gram seen in the training documents only;
# test-only n-grams are deliberately absent and therefore ignored when vectorising.
vocabulary = set()
for doc in xtrain_ngrams:
    vocabulary.update(doc)

# Both splits are vectorised against the same vocabulary object so their columns line up.
xtrain_vector = ngrams_to_vector(xtrain_ngrams, vocabulary)
xtest_vector = ngrams_to_vector(xtest_ngrams, vocabulary)



In [15]:
# MultinomialNB is the estimator instantiated below, so it has to be imported
# here; importing only GaussianNB raises NameError on a fresh kernel.
# GaussianNB stays imported in case other cells rely on it.
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

multinomial_nb = MultinomialNB()

# Training the classifier on the n-gram count vectors
multinomial_nb.fit(xtrain_vector, y_train)

# Making predictions on the test set and reporting plain accuracy
y_pred = multinomial_nb.predict(xtest_vector)
print(accuracy_score(y_test, y_pred))

0.4804270462633452


In [16]:
# Classify one hand-written review with the trained model.
review = "not great product but useful buy this"
# calculate_ngrams iterates over a collection of documents, so the single
# review is wrapped in a list.
review_ngrams = calculate_ngrams(docs=[review], n=n)
review_vector = ngrams_to_vector(ngrams=review_ngrams, vocabulary=vocabulary)
res = multinomial_nb.predict(review_vector)
print(res[0])  # predicted label for the one review passed in

1.0


In [17]:
# Classify one hand-written review with the trained model.
review = "best product highly recommended"
# calculate_ngrams iterates over a collection of documents, so the single
# review is wrapped in a list.
review_ngrams = calculate_ngrams(docs=[review], n=n)
review_vector = ngrams_to_vector(ngrams=review_ngrams, vocabulary=vocabulary)
res = multinomial_nb.predict(review_vector)
print(res[0])  # predicted label for the one review passed in

1.0


In [18]:
# Classify one hand-written review with the trained model.
review = "worst product highly not recommended"
# calculate_ngrams iterates over a collection of documents, so the single
# review is wrapped in a list.
review_ngrams = calculate_ngrams(docs=[review], n=n)
review_vector = ngrams_to_vector(ngrams=review_ngrams, vocabulary=vocabulary)
res = multinomial_nb.predict(review_vector)
print(res[0])  # predicted label for the one review passed in

1.0
