In [85]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

In [86]:
# Load the four input tables. Paths are relative to the notebook's working directory.
train = pd.read_csv("data/train.csv")    # labelled reviews (later cells read app_id, content, is_positive)
test = pd.read_csv("data/test.csv")      # reviews to predict (later cells read id, app_id, content)
store = pd.read_csv("data/steam_store_data_2024.csv")  # store metadata: title, price, salePercentage, review summaries
spy = pd.read_csv("data/output_steamspy.csv")          # SteamSpy export: appid, name, owners range

In [87]:
# Pre-process the SteamSpy dataset.
def parse_owners(owners_range):
    """Convert an owners range such as "20,000 .. 50,000" to its integer midpoint.

    Parameters
    ----------
    owners_range : str
        Range with " .. " between the bounds; thousands separators are allowed.

    Returns
    -------
    int or float
        Floor of the mean of the two bounds, or ``np.nan`` when the value is
        missing (None / NaN) or does not match the expected format.
    """
    try:
        low, high = owners_range.replace(",", "").split(" .. ")
        return (int(low) + int(high)) // 2
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a string (e.g. NaN from the CSV).
        # ValueError: wrong number of parts or non-numeric bounds.
        return np.nan

# Midpoint estimate of the owners range for every SteamSpy row.
spy["owners_est"] = spy["owners"].map(parse_owners)

# Align the key column name with train/test so the frames can be merged on it.
spy = spy.rename(columns={"appid": "app_id"})

In [88]:
# Same kind of pre-processing for the store dataset.
def parse_price(price_str):
    """Parse a price string such as "$9.99" or "$1,299.99" into a float.

    Parameters
    ----------
    price_str : str
        Price with an optional "$" sign and optional thousands separators.

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is missing or is not a
        number (any non-numeric label ends up as NaN).
    """
    try:
        return float(price_str.replace("$", "").replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. NaN); ValueError: not numeric.
        return np.nan

def parse_discount(sale_str):
    """Parse a discount label such as "-50%" into an integer percentage.

    Parameters
    ----------
    sale_str : str
        Discount text; "-" and "%" characters are stripped before conversion.

    Returns
    -------
    int
        The discount percentage, or 0 when the value is missing or cannot be
        parsed (treated as "no discount").
    """
    try:
        return int(sale_str.replace("-", "").replace("%", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. NaN); ValueError: not an integer.
        return 0

# Numeric versions of the price and discount columns.
store["price_num"] = store["price"].map(parse_price)
store["sale_pct"] = store["salePercentage"].map(parse_discount)

# Integer-encode the review summary labels with LabelEncoder.
# NOTE(review): the codes follow the encoder's label ordering, not sentiment
# strength, and astype(str) presumably turns missing values into the literal
# string "nan", which then gets its own code — confirm this is intended.
for col in ("recentReviews", "allReviews"):
    store[col] = store[col].astype(str)
    le = LabelEncoder()
    store[f"{col}_enc"] = le.fit_transform(store[col])

# Rename the title column so it can be joined with the SteamSpy data on "name".
store = store.rename(columns={"title": "name"})

In [89]:
# Join SteamSpy and store metadata on the game title (only titles present in both).
game_data = pd.merge(spy, store, on="name", how="inner")

# Keep only the columns used as model features, with exactly one row per app_id.
# Titles are not guaranteed to be unique, so the join above can produce several
# rows for the same app_id; without de-duplication the later left merges onto
# train/test would multiply review rows (and submission ids).
game_data_small = (
    game_data[["app_id", "owners_est", "price_num", "sale_pct", "recentReviews_enc", "allReviews_enc"]]
    .drop_duplicates(subset="app_id", keep="first")
    .reset_index(drop=True)
)

In [90]:
# Attach game-level features to every review; reviews whose app_id has no
# match keep their row and get NaN in the added columns (left join).
train_full = train.merge(game_data_small, how="left", on="app_id")
test_full = test.merge(game_data_small, how="left", on="app_id")

In [91]:
# Text pre-processing.
# Fetch the NLTK stopword corpus and build the English stopword set
# that clean_text() uses for membership checks.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a review for TF-IDF.

    Lower-cases the text, replaces every character that is not a letter, digit
    or whitespace with a space, then drops stopwords and tokens of length <= 2.
    Missing values (None / NaN) yield an empty string.
    """
    if pd.isna(text):
        return ""
    normalized = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
    kept = (
        token
        for token in normalized.split()
        if len(token) > 2 and token not in stop_words
    )
    return " ".join(kept)

# Cleaned review text for both splits; this column feeds the TF-IDF vectorizer.
train_full["clean_content"] = train_full["content"].map(clean_text)
test_full["clean_content"] = test_full["content"].map(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tmaxell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
# The vocabulary is learned on the training split only and reused for the test split.
X_text_train = tfidf.fit_transform(train_full["clean_content"])
X_text_test = tfidf.transform(test_full["clean_content"])

In [None]:
# Game-level numeric features taken from the merged store / SteamSpy data.
numeric_cols = ["owners_est", "price_num", "sale_pct", "recentReviews_enc", "allReviews_enc"]
# NaNs (e.g. reviews whose app_id found no match in game_data_small) become 0.
# NOTE(review): for the *_enc columns 0 is also a valid label code, so
# "missing" and that label are indistinguishable after this step.
X_numeric_train = train_full[numeric_cols].fillna(0)
X_numeric_test = test_full[numeric_cols].fillna(0)

In [None]:
# Concatenate the sparse TF-IDF block with the numeric block column-wise.
# The numeric DataFrame is converted to a sparse matrix explicitly (the same
# way the later feature-stacking cell does) rather than relying on scipy to
# coerce a DataFrame placed inside the block list.
X_train = hstack([X_text_train, csr_matrix(X_numeric_train.values)])
X_test = hstack([X_text_test, csr_matrix(X_numeric_test.values)])
# Binary target column.
y_train = train_full["is_positive"]

In [None]:
# Use CSR layout for both feature matrices.
X_train = X_train.tocsr()
X_test = X_test.tocsr()

# Target variable as a plain array.
y = y_train.values

# Model: class-balanced logistic regression.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Stratified 5-fold cross-validation scored by F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(lr, X_train, y, scoring='f1', cv=cv)

print("F1 scores по фолдам:", f1_scores)
print("Средний F1:", f1_scores.mean())

# Fit on the full training set, then predict the test set.
lr.fit(X_train, y)
test_preds = lr.predict(X_test)

# Build the submission (review id + predicted label) and write it to disk.
submission = test_full[["id"]].assign(sentiment=test_preds)
submission.to_csv("submission.csv", index=False)
print("Файл submission.csv сохранён.")

F1 scores по фолдам: [0.83583977 0.83534673 0.83696554 0.84028633 0.83195866]
Средний F1: 0.83607940413081
Файл submission.csv сохранён.


In [None]:
def count_punct(text):
    """Return how many characters of `text` are one of . , ; : ! ?"""
    return sum(ch in ".,;:!?" for ch in text)

def count_upper(text):
    """Return the share of characters in `text` that are uppercase.

    Despite the name this is a ratio in [0, 1], not a count; an empty
    input yields 0.
    """
    total = len(text)
    if total == 0:
        return 0
    uppercase = sum(ch.isupper() for ch in text)
    return uppercase / total

def count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    words = text.split()
    return len(words)

# Simple surface statistics of the raw review text, added to both splits in place.
text_stat_funcs = {
    "text_len": len,             # number of characters
    "word_count": count_words,   # whitespace-separated tokens
    "punct_count": count_punct,  # characters among . , ; : ! ?
    "upper_ratio": count_upper,  # share of uppercase characters
}

for df in [train_full, test_full]:
    # Missing reviews are treated as empty strings for every statistic.
    raw_content = df["content"].fillna("")
    for stat_name, stat_func in text_stat_funcs.items():
        df[stat_name] = raw_content.apply(stat_func)


In [None]:
from scipy.sparse import hstack, csr_matrix

# Game-level features plus the text statistics computed from the raw reviews.
numeric_cols = [
    "owners_est",
    "price_num",
    "sale_pct",
    "recentReviews_enc",
    "allReviews_enc",
    "text_len",
    "word_count",
    "punct_count",
    "upper_ratio",
]

# Missing values are replaced with 0 before stacking.
X_numeric_train = train_full[numeric_cols].fillna(0)
X_numeric_test = test_full[numeric_cols].fillna(0)

# Rebuild the full design matrices: TF-IDF columns first, numeric columns last.
X_train = hstack([X_text_train, csr_matrix(X_numeric_train.to_numpy())])
X_test = hstack([X_text_test, csr_matrix(X_numeric_test.to_numpy())])


In [None]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Dense copies of the feature matrices; the names are reused by later cells.
# `.toarray()` returns a regular ndarray, whereas `.todense()` returns the
# legacy `np.matrix` type that scikit-learn's input validation rejects.
# NOTE(review): both libraries also accept scipy sparse input, which would
# avoid materialising the full matrix — consider it if memory is tight.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

y = train_full["is_positive"].values

# XGBoost
# NOTE(review): `use_label_encoder` is kept from the original; recent xgboost
# releases ignore it with a warning — confirm against the installed version.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

xgb_params = {
    'max_depth': [3, 5],
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1]
}

# 3-fold grid search scored by F1; refit=True (the default) retrains the best
# configuration on the full training set.
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='f1', cv=3, verbose=1)
xgb_grid.fit(X_train_dense, y)

print("Лучшие параметры XGB:", xgb_grid.best_params_)
print("Лучший F1 XGB:", xgb_grid.best_score_)

# LightGBM
# subsample_freq=1 turns row bagging on; with LightGBM's default of 0 the
# `subsample` value is ignored and the two grid settings would be identical.
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, subsample_freq=1)

lgb_params = {
    'max_depth': [3, 5],
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1]
}

lgb_grid = GridSearchCV(lgb_model, lgb_params, scoring='f1', cv=3, verbose=1)
lgb_grid.fit(X_train_dense, y)

print("Лучшие параметры LGB:", lgb_grid.best_params_)
print("Лучший F1 LGB:", lgb_grid.best_score_)

# Pick the model with the higher cross-validated F1 (LightGBM wins ties).
# `best_estimator_` has already been refit on all training data by
# GridSearchCV, so no additional fit is needed here.
if xgb_grid.best_score_ > lgb_grid.best_score_:
    best_model = xgb_grid.best_estimator_
else:
    best_model = lgb_grid.best_estimator_

# Predict the test set.
test_preds = best_model.predict(X_test_dense)

# Save the submission.
submission = pd.DataFrame({
    "id": test_full["id"],
    "sentiment": test_preds
})
submission.to_csv("submission_advanced.csv", index=False)
print("submission_advanced.csv сохранён.")

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Cross-validated isotonic calibration. With an integer `cv`, clones of
# best_model are trained on each fold's training part and the calibrator is
# fitted on the held-out part. The previous cv='prefit' setup calibrated on
# the very rows best_model had been trained on, which biases the calibration
# map towards over-confident training-set scores.
calibrated_clf = CalibratedClassifierCV(best_model, cv=3, method='isotonic')
calibrated_clf.fit(X_train_dense, y)

# Calibrated probability of the positive class.
test_probs = calibrated_clf.predict_proba(X_test_dense)[:,1]

# Convert probabilities to class labels with a 0.5 threshold.
test_preds_calibrated = (test_probs > 0.5).astype(int)

# Save the calibrated submission.
submission_calibrated = pd.DataFrame({
    "id": test_full["id"],
    "sentiment": test_preds_calibrated
})
submission_calibrated.to_csv("submission_calibrated.csv", index=False)
print("submission_calibrated.csv сохранён.")


In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

# Column names in design-matrix order: TF-IDF terms first, numeric features last.
feature_names = list(tfidf.get_feature_names_out()) + numeric_cols

# Guarantee a plain ndarray: permutation_importance validates its input with
# scikit-learn's checks, which reject np.matrix. No copy is made when the
# input is already an ndarray.
X_explain = np.asarray(X_train_dense)

# SHAP values for the selected boosting model.
# NOTE(review): this explains every training row across all features and may
# be very slow on large data; consider a row sample.
explainer = shap.Explainer(best_model, X_explain)
shap_values = explainer(X_explain)

# Feature-importance summary plot.
shap.summary_plot(shap_values, features=X_explain, feature_names=feature_names)

# Permutation importance (measured on the training data, scored by F1).
result = permutation_importance(best_model, X_explain, y, scoring='f1', n_repeats=10, random_state=42, n_jobs=-1)

# Feature indices ordered from most to least important.
sorted_idx = result.importances_mean.argsort()[::-1]

# Plot at most 20 features; the guard keeps the bar and tick counts equal if
# the model ever has fewer than 20 features.
top_k = min(20, len(sorted_idx))
top_idx = sorted_idx[:top_k]

plt.figure(figsize=(10,6))
plt.barh(range(top_k), result.importances_mean[top_idx], align='center')
plt.yticks(range(top_k), np.array(feature_names)[top_idx])
plt.xlabel("Permutation Importance (mean)")
plt.title("Top 20 важнейших признаков")
plt.gca().invert_yaxis()  # most important feature at the top
plt.show()
