In [1]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)|
import numpy as np 
import pandas as pd 
import random as rn
import re
import nltk
import os


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Seed the stdlib RNG so the random row sample below is repeatable.
rn.seed(42)
p = 0.004  # probability with which each data row is kept


def _skip_row(row_number):
    """Return True when pandas should skip this row of the CSV.

    Row 0 (the header) is never skipped and consumes no random number;
    every other row is skipped whenever the random draw exceeds ``p``.
    """
    return row_number > 0 and rn.random() > p


df_reviews = pd.read_csv(
    '/kaggle/input/steam-reviews/dataset.csv',
    skiprows=_skip_row,
)

# Report the size of the sampled frame, then show its first rows.
print(df_reviews.shape)
display(df_reviews.head())

In [3]:
# Cast the text columns to str and trim surrounding whitespace from the
# review text. Note that astype(str) turns missing values into the literal
# string 'nan'.
df_reviews["review_text"] = df_reviews["review_text"].astype(str).str.strip()
df_reviews["review_votes"] = df_reviews["review_votes"].astype(str)

# Keep only the reviews that have a score. The explicit .copy() makes
# df_reviews_2 an independent frame, so the column assignment below does not
# write into a slice of df_reviews (chained assignment / SettingWithCopyWarning).
df_reviews_2 = df_reviews[df_reviews["review_score"].notnull()].copy()

# Recode the scores from {1, -1} to {1, 0}; every value other than -1 is
# left as it is.
df_reviews_2["review_score"] = np.where(
    df_reviews_2["review_score"] == -1, 0, df_reviews_2["review_score"]
)

In [4]:
## Remove the reviews whose whole text is "Early Access Review", and the rows
# whose text is the literal string 'nan'.
is_placeholder = df_reviews_2["review_text"] == "Early Access Review"
is_nan_text = df_reviews_2["review_text"].isin(['nan'])
df_reviews_2 = df_reviews_2[~is_placeholder]
df_reviews_2 = df_reviews_2[~is_nan_text]
print(df_reviews_2.shape)

# Drop rows that repeat the same (review_text, review_score) pair, keeping the
# first occurrence. Reassigning (instead of inplace=True on a filtered slice)
# and taking an explicit copy leaves df_reviews_2 as an independent frame that
# later cells can add columns to.
df_reviews_2 = df_reviews_2.drop_duplicates(['review_text', 'review_score']).copy()
print(df_reviews_2.shape)

In [5]:
## Text Cleaning
def replace_hearts_with_PAD(text):
    """Replace each run of one or more '♥' characters with ' **** '.

    Despite the name, the replacement token is the literal ' **** '
    (including the surrounding spaces), not a "PAD" marker.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every maximal run of hearts collapsed into one token.
    """
    heart_run = re.compile(r"[♥]+")
    return heart_run.sub(' **** ', text)
# Store the heart-free version of every review in a new column.
df_reviews_2['review_text_clean'] = df_reviews_2['review_text'].map(replace_hearts_with_PAD)

In [6]:
# Draw a reproducible sample of 2000 reviews whose score is 0.
is_negative = df_reviews_2['review_score'] == 0
neg_reviews = df_reviews_2.loc[is_negative].sample(n=2000, random_state=1234)

# Cleaned text of the sampled reviews as a plain Python list.
all_intents = neg_reviews['review_text_clean'].to_list()

print(neg_reviews.shape)
display(neg_reviews.head())

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF with English stop words removed and raw (non-sublinear)
# term frequencies.
tfIdfVectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    use_idf=True,
    sublinear_tf=False,
)

# Fit on the cleaned text of every remaining review; X holds one row per
# review in df_reviews_2.
X = tfIdfVectorizer.fit_transform(df_reviews_2['review_text_clean'])

# Show the learned vocabulary.
tfIdfVectorizer.get_feature_names_out()


In [8]:
# Target labels: the recoded review score of each review.
y = df_reviews_2.review_score

In [9]:
!pip install dask

In [10]:
import dask
import dask.array as da

# Wrap the TF-IDF matrix in a dask array split into 1000 x 1000 chunks.
X_d = da.from_array(X, chunks=(1000, 1000))

# Rank-300 randomized (compressed) SVD. The algorithm draws random numbers,
# so a fixed seed is passed to make the factors repeatable from run to run,
# matching the seeded sampling and TruncatedSVD used elsewhere in the notebook.
u, s, v = da.linalg.svd_compressed(X_d, k=300, seed=42)


In [11]:
# Left factor: scale the columns of u by the square roots of the singular
# values, then materialise the result as a NumPy array.
sqrt_s_left = np.diag(np.sqrt(s))
L = np.array(u.dot(sqrt_s_left))

In [12]:
# Right factor: scale the rows of v by the square roots of the singular
# values, then materialise the result as a NumPy array.
sqrt_s_right = np.diag(np.sqrt(s))
R = np.array(sqrt_s_right.dot(v))

In [13]:
# Dense low-rank reconstruction: the matrix product of the two factors.
r = L @ R

In [14]:
# Add the absolute value of the smallest entry to every entry of r
# (when that minimum is negative, the shifted matrix has minimum 0).
min_magnitude = np.abs(r.min())
r = r + min_magnitude

In [15]:
# Map each app_id to the integer row positions of its reviews in df_reviews_2.
reviews_by_app = df_reviews_2.groupby('app_id')
idx = reviews_by_app.indices

In [16]:
# Map each app_id in idx to the app_name on the first row of that app.
# The lookup table is built in a single pass (first row per app_id) instead
# of re-filtering the whole frame once for every app.
first_name_by_app = (
    df_reviews_2
    .drop_duplicates(subset='app_id')
    .set_index('app_id')['app_name']
)
names = {app_id: first_name_by_app[app_id] for app_id in idx}

In [17]:
# Display the full app_id -> app_name mapping built above.
names

In [18]:
# app_id whose reviews are inspected.
n = 570
print(names[n])

# Sum the shifted reconstruction over the rows belonging to this app and
# list the 30 vocabulary terms with the largest totals, highest first.
vocabulary = tfIdfVectorizer.get_feature_names_out()
term_totals = r[idx[n]].sum(axis=0)
top_term_positions = (-term_totals).argsort()[:30]
vocabulary[top_term_positions]

In [19]:
import matplotlib.pyplot as plt



In [20]:
# Shape of the reconstruction restricted to the rows of app 570.
(L[idx[570]] @ R).shape

In [21]:
# Same ranking as before but computed from L and R directly (without the
# constant shift applied to r); show the 10 highest-scoring terms for app 570.
app_term_totals = L[idx[570]].dot(R).sum(axis=0)
top10_positions = (-app_term_totals).argsort()[:10]
tfIdfVectorizer.get_feature_names_out()[top10_positions]

In [22]:
# Mean of the 0/1 review scores, i.e. the fraction of reviews scored 1.
mu = df_reviews_2.review_score.mean()

In [23]:
# Inspect the first row of the TF-IDF matrix.
X[0]

In [24]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 300 components with a seeded truncated SVD.
svd_settings = {
    'n_components': 300,
    'n_iter': 7,
    'random_state': 42,
}
svd = TruncatedSVD(**svd_settings)

# One row of 300 components per review.
X_new = svd.fit_transform(X)


In [25]:

# One row of SVD components per review, in the same order as df_reviews_2.
new_df = pd.DataFrame(X_new)

# new_df has a fresh 0..n-1 index while df_reviews_2 still carries the index
# left over from sampling and filtering. Assigning a Series would align on
# those labels and put votes/scores on the wrong rows (or leave NaN), so the
# values are attached positionally as plain arrays instead. A length mismatch
# now raises instead of being silently filled.
new_df['review_votes'] = df_reviews_2['review_votes'].astype(np.float64).to_numpy()
new_df['review_score'] = df_reviews_2['review_score'].to_numpy()

In [26]:
import numpy as np
from sklearn.model_selection import train_test_split

# Target: the review score, with any missing label replaced by 0.
y = new_df['review_score'].fillna(0)

# Features: every remaining column, with missing values replaced by 0.
X_data = new_df.drop(columns=['review_score']).fillna(0)

In [27]:
# Hold out 33% of the rows as a test set; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y,
    test_size=0.33,
    random_state=42,
)

In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_settings = {
    'n_estimators': 1000,
    'max_depth': 8,
    'tree_method': 'gpu_hist',
    'objective': 'binary:hinge',
    'learning_rate': 0.1,
    'reg_alpha': 3,
    'reg_lambda': 3,
}
model = XGBClassifier(**xgb_settings)


In [29]:
# Carve a validation set out of the training data for early stopping.
# Monitoring X_test here would let the test data choose the number of
# boosting rounds and make the accuracy reported below optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Stop adding trees once the validation metric has not improved for 30 rounds.
model.fit(X_fit, y_fit, early_stopping_rounds=30, eval_set=[(X_val, y_val)], verbose=True)

# Score the model on the untouched test set; accuracy_score takes
# (y_true, y_pred) in that order.
preds = model.predict(X_test)
print(accuracy_score(y_test, preds))