In [2]:
import re
import string

import pandas as pd
import pymorphy2
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

stop_words = set(stopwords.words('russian'))
punctuation_symbols = set(string.punctuation)

**Attempt 1: The shortened version of the dataset**

In [None]:
# Load the shortened review dataset used for the first attempt.
short_dataset_path = '/short_dataset.csv'
df = pd.read_csv(short_dataset_path)

In [None]:
# Shared pymorphy2 analyzer, created on first use. The pymorphy2 documentation
# recommends building one MorphAnalyzer and reusing it, because every instance
# loads its own copy of the dictionaries.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared ``pymorphy2.MorphAnalyzer``, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def get_clean_tokens(sentence, morph=None):
    """Tokenize, filter and lemmatize a single text.

    Steps, in order:
      1. lower-case ``sentence`` and split it into ``\\w+`` tokens;
      2. drop tokens found in ``stop_words`` or ``punctuation_symbols``;
      3. drop tokens of four characters or fewer;
      4. replace each remaining token with the normal form of its first
         pymorphy2 parse.

    Args:
        sentence: The text to process (must support ``.lower()``).
        morph: Optional analyzer exposing ``parse(token)``; when omitted, a
            shared ``pymorphy2.MorphAnalyzer`` is used.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    if morph is None:
        morph = _get_morph_analyzer()

    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    tokens = re.findall(r"\w+", sentence.lower())
    kept_tokens = [
        token for token in tokens
        if token not in stop_words
        and token not in punctuation_symbols
        and len(token) > 4
    ]
    return [morph.parse(token)[0].normal_form for token in kept_tokens]

# Clean every review, then glue each token list back into one space-separated string.
sents_as_tokens = list(map(get_clean_tokens, df['Review']))
sents_as_strings = list(map(' '.join, sents_as_tokens))

In [None]:
# TF-IDF features: at most 30,000 terms, ignoring terms that occur in fewer
# than 7 reviews or in more than 80% of them, plus NLTK's Russian stop words.
vectorizer = TfidfVectorizer(max_features=30000, min_df=7, max_df=0.8, stop_words=stopwords.words('russian'))
# NOTE(review): the vectorizer is fitted on the raw `df['Review']` text, not on
# `sents_as_strings` from the previous cell — confirm this is intended.
# The result is kept as the sparse matrix returned by fit_transform:
# train_test_split and MultinomialNB both accept sparse input, and a dense copy
# of a mostly-zero TF-IDF matrix only costs memory.
processed_features = vectorizer.fit_transform(df['Review'])

In [None]:
def sent_changer(sentiment):
    """Convert a sentiment label into a binary target.

    Returns 1 when ``sentiment`` equals ``'positive'`` and 0 for any other value.
    """
    return 1 if sentiment == 'positive' else 0
# Binary targets: 1 for 'positive' rows, 0 for every other label.
sentiments = df['Sentiment'].apply(sent_changer)

# Fit the encoder on the targets, then encode them as integer class ids.
Le = LabelEncoder()
Le.fit(sentiments)
y = Le.transform(sentiments)

y.shape

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    processed_features,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Train a multinomial Naive Bayes classifier on the training split.
MNB = MultinomialNB()
MNB.fit(X=X_train, y=Y_train)

In [None]:
# Predict labels for the held-out split.
predicted = MNB.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred): ground truth
# first, predictions second.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
f1_score = metrics.f1_score(Y_test, predicted)
# Accuracy first, then F1, each as a percentage with one decimal place.
print('{:.1%}'.format(accuracy_score))
print('{:.1%}'.format(f1_score))

**Attempt 2: The full version of the dataset**

In [4]:
# Load the full review dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs elsewhere after pointing it at a local copy of the CSV.
df = pd.read_csv('/Users/urijzuzaev/Desktop/Thesis/reviews_dataset/combined_dataset.csv')
# Keep only rows labelled 'positive' or 'negative'; any other label is dropped.
df_pos = df[df['Sentiment']=='positive']
df_neg = df[df['Sentiment']=='negative']
df = pd.concat([df_pos, df_neg])
# Shuffle the rows. The seed makes the shuffle (and therefore the later
# train/test split and the reported scores) reproducible across runs.
df = df.sample(frac=1, random_state=5).reset_index(drop=True)

In [5]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: at most 4,000 terms, ignoring terms that occur in fewer
# than 7 reviews or in more than 80% of them, plus NLTK's Russian stop words.
vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('russian'))
# Keep the sparse matrix returned by fit_transform instead of densifying it:
# with the ~71k reviews of this dataset and up to 4,000 columns, a dense
# float64 array would need roughly 2.3 GB, while train_test_split and
# MultinomialNB both accept sparse input directly.
processed_features = vectorizer.fit_transform(df['Review'])

In [6]:
def sent_changer(sentiment):
    """Convert a sentiment label into a binary target.

    Returns 1 when ``sentiment`` equals ``'positive'`` and 0 for any other value.
    """
    return 1 if sentiment == 'positive' else 0
# Binary targets: 1 for 'positive' rows, 0 for every other label.
sentiments = df['Sentiment'].apply(sent_changer)

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the targets, then encode them as integer class ids.
Le = LabelEncoder()
Le.fit(sentiments)
y = Le.transform(sentiments)

y.shape

(71239,)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    processed_features,
    y,
    test_size=0.2,
    random_state=5,
)

In [8]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training split.
MNB = MultinomialNB()
MNB.fit(X=X_train, y=Y_train)

MultinomialNB()

In [9]:
from sklearn import metrics
# Predict labels for the held-out split.
predicted = MNB.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred): ground truth
# first, predictions second.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
f1_score = metrics.f1_score(Y_test, predicted)

In [10]:
# Report accuracy, then F1, each as a percentage with one decimal place.
for score in (accuracy_score, f1_score):
    print('{:.1%}'.format(score))

86.0%
92.5%
