If you're working in Google Colab, first install the dependencies by running:
> `!pip install -r requirements.txt`


In [None]:
# load the reviews; header=None means no row of the file is used as a header,
# so the four column names are supplied explicitly via `names`
reviews = pd.read_csv('steam.csv', header=None, names=['app_id', 'text_review', 'review_score', 'review_votes'])

In [None]:
# preview the first rows of the loaded data
reviews.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line
reviews.info()

In [None]:
# dataset dimensions, followed by a legend line explaining the tuple
print(reviews.shape, "rows, columns", sep="\n")

### Preprocessing

In [None]:
# count the missing values in every column and print the totals
missing_per_column = reviews.isnull().sum()
print(missing_per_column)

In [None]:
# removing rows that have a missing value in any column
# (dropna() without `subset` is not limited to text_review)
reviews = reviews.dropna()

In [None]:
# re-check the per-column missing-value counts after dropping rows
remaining_missing = reviews.isnull().sum()
print(remaining_missing)

In [None]:
# (rows, columns) after dropping rows with missing values
reviews.shape

In [None]:
# keep only half of the dataset so preprocessing and training run faster (it's a large set);
# the split is stratified on review_score and the second half of the split is the one retained
reviews_sampled = train_test_split(
    reviews,
    test_size=0.5,
    stratify=reviews['review_score'],
    random_state=42,
)[1]
reviews = reviews_sampled.copy()

In [None]:
# (rows, columns) after keeping half of the data
reviews.shape

In [None]:
# show the base stopword collection (STOPWORDS) that the custom stopword set is built from
print(STOPWORDS)

In [None]:
# function for text normalization (to lower, removing whitespaces, numbers etc)
def preprocess_data(reviews):
    """Normalize the text of a single review.

    The text is lower-cased, stripped of digits and punctuation, has runs of
    whitespace collapsed and is trimmed at both ends.

    Args:
        reviews: Raw review text. Values that are not ``str`` (e.g. NaN) are
            passed through untouched.

    Returns:
        The normalized string, or the input itself when it is not a ``str``.
    """
    if not isinstance(reviews, str):
        return reviews

    text = reviews.lower()
    text = strip_punctuation(strip_numeric(text))
    # Collapse whitespace only after digits/punctuation are gone: assuming these
    # are gensim's helpers, punctuation is replaced by spaces and digits are
    # deleted, so doing it earlier leaves fresh multi-space gaps in the result.
    text = strip_multiple_whitespaces(text)
    return text.strip()

In [None]:
# adding custom stopwords (mostly mis-encoded character sequences) that appear in the dataset
extra_stopwords = ['ô', '•', 'AAA', 'ù§', 'Ä¢', 'ñë', 'ñÑ', 'ñà']
custom_stopwords = set(STOPWORDS)
custom_stopwords.update(extra_stopwords)
# stopwords are matched against text that preprocess_data has already lower-cased,
# so entries containing upper-case characters ('AAA', 'Ä¢', 'ñÑ') could never match;
# register their lower-case forms as well (the original spellings are kept)
custom_stopwords.update(word.lower() for word in extra_stopwords)
custom_stopwords = frozenset(custom_stopwords)

In [None]:
# function that removes stopwords
def remove_custom_stopwords(reviews):
    """Filter stopwords out of a review.

    Delegates to ``remove_stopwords``, passing the module-level
    ``custom_stopwords`` set as the stopword list.

    Args:
        reviews: Text of a single review.

    Returns:
        Whatever ``remove_stopwords`` returns for the given text.
    """
    filtered = remove_stopwords(reviews, stopwords=custom_stopwords)
    return filtered

In [None]:
# step 1: normalize the raw text; step 2: strip stopwords from the normalized text
normalized = reviews['text_review'].apply(preprocess_data)
reviews['clean_reviews'] = normalized
reviews['stopwords_removed'] = normalized.apply(remove_custom_stopwords)

# printing few cleaned reviews
reviews.head()

### Data visualization

In [None]:
# number of reviews in each review_score category (1 / -1), largest category first
per_class = reviews.groupby('review_score')['text_review'].count()
temp = per_class.reset_index().sort_values(by='text_review', ascending=False)
temp.style.background_gradient(cmap='Purples')

In [None]:
# set style and figure size in a single call: sns.set() is an alias of sns.set_theme(),
# so calling it afterwards with only rc=... reset the style to the default and
# silently discarded style='whitegrid'
sns.set_theme(style='whitegrid', rc={'figure.figsize': (13, 8)})
sns.set_palette("pastel")
# bar chart of the number of reviews per review_score value
sns.countplot(x='review_score', data=reviews)

### Dividing data into training and test sets

In [None]:
X = reviews['stopwords_removed']  # working on cleaned data
y = reviews['review_score']

# hold out 20% for testing; stratify on the label (as the earlier sampling step does)
# so that the imbalanced class ratio is the same in the training and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# turning reviews into TF-IDF vectors, limited to max_features=1000;
# the vectorizer is fitted on the training split only and then reused
# to transform the test split
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# inspect the feature names exposed by the fitted vectorizer
vectorizer.get_feature_names_out()

### Balancing data

In [None]:
# we use random undersampling to balance the classes so that both have the same
# number of training examples, since there are many more positive reviews than negative ones;
# only the training split is resampled here (the test vectors are left as they are)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled  = rus.fit_resample(X_train_tfidf, y_train)

In [None]:
# number of training samples in each class after balancing
y_train_resampled.value_counts()

In [None]:
# set style and figure size in a single call: sns.set() is an alias of sns.set_theme(),
# so calling it afterwards with only rc=... reset the style to the default and
# silently discarded style='whitegrid'
sns.set_theme(style='whitegrid', rc={'figure.figsize': (13, 8)})
sns.set_palette("pastel")
# bar chart of the class counts in the resampled training labels
sns.countplot(x=y_train_resampled)