In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import brotli
import networkx as nx
from scipy.sparse import csr_matrix, hstack
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn, stopwords
import string
import nltk
from itertools import combinations
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and word.isalnum()]
    return ' '.join(words)

df = pd.read_csv('imdb.csv')
df['review'] = df['review'].apply(preprocess_text)

In [5]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer.fit_transform(df['review'])
features = vectorizer.get_feature_names_out()

In [6]:
def enrich_features_with_wordnet(features):
    enriched_features = set(features)
    for word in features:
        for syn in wn.synsets(word):
            for lemma in syn.lemmas():
                enriched_features.add(lemma.name().replace('_', ' ').lower())
            for hyper in syn.hypernyms():
                enriched_features.update(lem.name().replace('_', ' ').lower() for lem in hyper.lemmas())
            for hypo in syn.hyponyms():
                enriched_features.update(lem.name().replace('_', ' ').lower() for lem in hypo.lemmas())
    return list(enriched_features)

In [7]:
enriched_features = enrich_features_with_wordnet(features)
enriched_vectorizer = TfidfVectorizer(vocabulary=enriched_features)
X_enriched = enriched_vectorizer.fit_transform(df['review'])

In [8]:
def compress(text):
    return brotli.compress(text.encode('utf-8'))

In [9]:
def calculate_ncd(text1, text2):
    c_x1 = compress(text1)
    c_x2 = compress(text2)
    c_x1x2 = compress(text1 + text2)
    ncd = (len(c_x1x2) - min(len(c_x1), len(c_x2))) / max(len(c_x1), len(c_x2))
    return ncd

In [10]:
def build_word_network(features):
    G = nx.Graph()
    for word1, word2 in combinations(features, 2):
        ncd = calculate_ncd(word1, word2)
        G.add_edge(word1, word2, weight=ncd)
    return G
G = build_word_network(features[:100]) 

In [11]:
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

In [12]:
def calculate_centrality_features(document_features, centrality_dict):
    centrality_features = []
    for doc_idx in range(document_features.shape[0]):
        doc_feature_indices = document_features[doc_idx].nonzero()[1]
        centrality_scores = [centrality_dict.get(features[index], 0) for index in doc_feature_indices if features[index] in centrality_dict]
        if centrality_scores:
            centrality_features.append(np.mean(centrality_scores))
        else:
            centrality_features.append(0)
    return np.array(centrality_features).reshape(-1, 1)

In [13]:
degree_features = calculate_centrality_features(X_tfidf, degree_centrality)
betweenness_features = calculate_centrality_features(X_tfidf, betweenness_centrality)
closeness_features = calculate_centrality_features(X_tfidf, closeness_centrality)

In [14]:
degree_csr = csr_matrix(degree_features)
betweenness_csr = csr_matrix(betweenness_features)
closeness_csr = csr_matrix(closeness_features)

In [15]:
X_combined = hstack([X_tfidf, X_enriched, degree_csr, betweenness_csr, closeness_csr])
X_train, X_test, y_train, y_test = train_test_split(X_combined, df['sentiment'], test_size=0.2, random_state=42)

In [16]:
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [17]:
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.7335
              precision    recall  f1-score   support

    negative       0.76      0.68      0.72      4961
    positive       0.71      0.79      0.75      5039

    accuracy                           0.73     10000
   macro avg       0.74      0.73      0.73     10000
weighted avg       0.74      0.73      0.73     10000

