In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from scipy.sparse import hstack

In [7]:
# Load the four input tables: the labelled train set, the test set, and the
# two auxiliary per-game tables (Steam store listing and SteamSpy export).
train, test, store, spy = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/steam_store_data_2024.csv",
        "data/output_steamspy.csv",
    )
)

In [8]:
# Process the SteamSpy dataset.
def parse_owners(owners_range):
    """Estimate an owner count from a range string.

    Parameters
    ----------
    owners_range : str
        A "low .. high" range, e.g. ``"20,000 .. 50,000"``. Comma thousands
        separators are removed, and whitespace around the ``..`` delimiter
        is optional.

    Returns
    -------
    int or float
        The floor of the midpoint of the two bounds, or ``np.nan`` when the
        value is missing (not a string) or cannot be parsed as two integers.
    """
    try:
        # int() ignores surrounding whitespace, so splitting on the bare
        # ".." accepts both "a .. b" and "a..b".
        low, high = owners_range.replace(",", "").split("..")
        return (int(low) + int(high)) // 2
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN or None.
        # ValueError: wrong number of bounds or a non-integer bound.
        # Deliberately not a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate while this runs inside a long ``.apply``.
        return np.nan

# Add the midpoint owner estimate and rename the id column to "app_id",
# the key used when joining onto train/test.
spy = spy.assign(owners_est=spy["owners"].apply(parse_owners)).rename(
    columns={"appid": "app_id"}
)

In [9]:
# Same kind of parsing for the store dataset.
def parse_price(price_str):
    """Convert a dollar price string to a float.

    Parameters
    ----------
    price_str : str
        Price text such as ``"$19.99"`` or ``"$1,299.00"``. The ``$`` sign
        and comma thousands separators are removed before conversion.

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is missing (not a
        string) or is not a number (for example a textual label).
    """
    try:
        return float(price_str.replace("$", "").replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN or None.
        # ValueError: text that is not a number.
        # Deliberately not a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate while this runs inside a long ``.apply``.
        return np.nan

def parse_discount(sale_str):
    """Convert a discount string such as ``"-50%"`` to an integer percentage.

    Parameters
    ----------
    sale_str : str
        Discount text; ``-`` and ``%`` characters are removed before
        conversion.

    Returns
    -------
    int
        The discount percentage, or ``0`` when the value is missing (not a
        string) or cannot be parsed as an integer.
    """
    try:
        return int(sale_str.replace("-", "").replace("%", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN or None.
        # ValueError: empty or non-integer text.
        # Deliberately not a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate while this runs inside a long ``.apply``.
        return 0

# Derive numeric price and discount columns from the raw store strings.
store = store.assign(
    price_num=store["price"].apply(parse_price),
    sale_pct=store["salePercentage"].apply(parse_discount),
)

# Label-encode the review columns. They are cast to str first, so missing
# values become the literal "nan" and get their own class. A fresh encoder
# is fitted per column, and the source column is overwritten with its
# string form.
for col in ("recentReviews", "allReviews"):
    store[col] = store[col].astype(str)
    le = LabelEncoder()
    le.fit(store[col])
    store[f"{col}_enc"] = le.transform(store[col])

# Rename the title column so it can serve as the join key.
store = store.rename(columns={"title": "name"})

In [10]:
# Join the SteamSpy and store tables on the game name. Only games present
# in both tables survive (inner join).
game_data = pd.merge(spy, store, on="name", how="inner")

# A name is not a unique key, so the join above can yield several rows for
# the same app_id. Keep a single row per app_id; otherwise the left joins
# onto train/test would silently multiply their rows.
# NOTE(review): which duplicate is kept (the first) is arbitrary; confirm
# this is acceptable or aggregate instead.
game_data_small = game_data[
    [
        "app_id",
        "owners_est",
        "price_num",
        "sale_pct",
        "recentReviews_enc",
        "allReviews_enc",
    ]
].drop_duplicates(subset="app_id")

In [11]:
# Attach the per-game features to every train/test row by app_id.
# how="left" keeps rows without a matching game; their features are NaN.
train_full = train.merge(game_data_small, how="left", on="app_id")
test_full = test.merge(game_data_small, how="left", on="app_id")

In [None]:
# Text preprocessing: fetch the NLTK stop-word corpus, then build the set of
# English stop words used by clean_text.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

def clean_text(text):
    """Normalise one raw text value into a space-joined token string.

    Missing values (as judged by ``pd.isna``) become ``""``. Otherwise the
    text is lower-cased and every character that is not an ASCII letter, a
    digit or whitespace is replaced by a space. The result is split on
    whitespace, and tokens that appear in the module-level ``stop_words``
    set or have fewer than three characters are dropped.
    """
    if not pd.isna(text):
        lowered = text.lower()
        alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", lowered)
        kept = []
        for token in alnum_only.split():
            if token in stop_words:
                continue
            if len(token) <= 2:
                continue
            kept.append(token)
        return " ".join(kept)
    return ""

# Add a cleaned copy of the "content" column to both frames.
train_full = train_full.assign(
    clean_content=train_full["content"].apply(clean_text)
)
test_full = test_full.assign(
    clean_content=test_full["content"].apply(clean_text)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tmaxell/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Unigram and bigram TF-IDF features, capped at 10000 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer.
X_text_train = tfidf.fit_transform(train_full["clean_content"])
X_text_test = tfidf.transform(test_full["clean_content"])

In [None]:
# Per-game numeric features joined in from the SteamSpy and store tables.
numeric_cols = [
    "owners_est",
    "price_num",
    "sale_pct",
    "recentReviews_enc",
    "allReviews_enc",
]
# Rows without a matching game (and unparseable values) are NaN here; they
# are replaced with 0.
# NOTE(review): 0 is also a valid label-encoder class for the *_enc columns,
# so "missing" and "class 0" become indistinguishable. Confirm this is intended.
X_numeric_train = train_full.loc[:, numeric_cols].fillna(value=0)
X_numeric_test = test_full.loc[:, numeric_cols].fillna(value=0)

In [None]:
# Final design matrices: the TF-IDF columns followed by the numeric columns.
X_train = hstack(
    [X_text_train, X_numeric_train],
)
X_test = hstack(
    [X_text_test, X_numeric_test],
)
# Target column for the training rows.
y_train = train_full.loc[:, "is_positive"]