In [85]:
# !pip3 install nltk

import re
import sys
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import word_tokenize
import nltk

sys.path.append("/Users/wzq/Desktop/game")
from utils import load_glove_embeddings, sentence_to_embedding, save_zip_file

# jupyter配置
from IPython.display import display
# Notebook display limits: how many rows / columns a cell may render, and
# no truncation of long cell text (max_colwidth=None).
for option_name, option_value in {
    'display.max_rows': 10000,
    'display.max_columns': 100000,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

In [129]:
## Data processing: rebalance the rating classes
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Ratings 5 and 4: draw a fixed 100,000 rows each, with replacement and a fixed seed.
duplicated_samples5, duplicated_samples4 = (
    train_data.loc[train_data['rating'] == score].sample(n=100000, random_state=42, replace=True)
    for score in (5, 4)
)
# Ratings 3, 2 and 1: take every row as-is; balancing happens by repetition below.
duplicated_samples3, duplicated_samples2, duplicated_samples1 = (
    train_data.loc[train_data['rating'] == score] for score in (3, 2, 1)
)

# Rating 1 is stacked three times and rating 2 twice; the list order fixes the
# row order of the resulting frame.
rebalanced_parts = [
    duplicated_samples1,
    duplicated_samples2,
    duplicated_samples3,
    duplicated_samples2,
    duplicated_samples1,
    duplicated_samples4,
    duplicated_samples5,
    duplicated_samples1,
]
train_data = pd.concat(rebalanced_parts, axis=0, ignore_index=True)
train_data['rating'].value_counts()

rating
4    100000
5    100000
1     87447
3     79870
2     70892
Name: count, dtype: int64

In [131]:
# Per-user aggregate features: rating statistics plus vote / helpful-vote statistics.
user_features = (
    train_data.groupby('user_id')
    .agg(
        user_rating_mean=('rating', 'mean'),
        user_rating_count=('rating', 'count'),
        user_rating_std=('rating', 'std'),
        user_rating_var=('rating', 'var'),
        user_votes_mean=('votes', 'mean'),
        user_votes_max=('votes', 'max'),
        user_helpful_votes_mean=('helpful_votes', 'mean'),
        user_helpful_votes_max=('helpful_votes', 'max'),
    )
    .reset_index()
)

# Derived ratios: share of a user's votes that were helpful, and that share
# scaled by the user's mean rating. The second column reads the first one.
user_features = user_features.assign(
    user_votes_helpful_rate=lambda f: f['user_helpful_votes_mean'] / f['user_votes_mean'],
    user_helpful_rating=lambda f: f['user_votes_helpful_rate'] * f['user_rating_mean'],
)
user_features.head()

Unnamed: 0,user_id,user_rating_mean,user_rating_count,user_rating_std,user_rating_var,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,user_votes_helpful_rate,user_helpful_rating
0,0,4.313253,83,0.714445,0.510432,3.819277,13,2.771084,11,0.725552,3.12949
1,1,4.77193,57,0.500626,0.250627,2.245614,9,1.105263,5,0.492188,2.348684
2,2,2.913386,127,1.266313,1.60355,2.76378,25,0.84252,9,0.304843,0.888126
3,3,2.917683,328,1.207491,1.458035,9.893293,69,5.554878,32,0.561479,1.638218
4,4,3.335196,179,1.244963,1.549934,5.324022,24,3.910615,20,0.734523,2.449776


In [132]:
# Per-product aggregate features: rating statistics plus vote / helpful-vote statistics.
product_features = (
    train_data.groupby('product_id')
    .agg(
        product_rating_mean=('rating', 'mean'),
        product_rating_count=('rating', 'count'),
        product_rating_std=('rating', 'std'),
        product_rating_var=('rating', 'var'),
        product_votes_mean=('votes', 'mean'),
        product_votes_max=('votes', 'max'),
        product_helpful_votes_mean=('helpful_votes', 'mean'),
        product_helpful_votes_max=('helpful_votes', 'max'),
    )
    .reset_index()
)

# Derived ratios: share of a product's votes that were helpful, and that share
# scaled by the product's mean rating. The second column reads the first one.
product_features = product_features.assign(
    product_votes_helpful_rate=lambda f: f['product_helpful_votes_mean'] / f['product_votes_mean'],
    product_helpful_rating=lambda f: f['product_votes_helpful_rate'] * f['product_rating_mean'],
)

# Not the last expression of the cell, so Jupyter does not render this preview.
product_features.head()
# Load the 50-dimensional GloVe vectors via the project helper.
glove_embeddings = load_glove_embeddings('/Users/wzq/Desktop/coding/glove.6B/glove.6B.50d.txt')


In [133]:
## Data processing: reload the raw files and attach the aggregate features
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Training rows use the default inner join on each key.
train_data = pd.merge(train_data, user_features, on='user_id')
train_data = pd.merge(train_data, product_features, on='product_id')

# Test rows use left joins, so every test row is kept; users or products
# without a match get NaN in the feature columns.
test_data = pd.merge(test_data, user_features, on='user_id', how='left')
test_data = pd.merge(test_data, product_features, on='product_id', how='left')



# Tokenize a sentence and look up a word embedding for each token.
# NOTE: this cell-level definition replaces the `sentence_to_embedding` imported
# from `utils` at the top of the notebook once the cell has run.
def sentence_to_embedding(sentence, embeddings_index, enable_mean=False, embedding_dim=50):
    """Map a sentence to word embeddings, optionally averaged into one vector.

    Args:
        sentence: Text to embed. Any non-string value (for example the float
            NaN pandas uses for a missing cell) is treated as empty text.
        embeddings_index: Mapping from lower-cased token to its embedding vector.
        enable_mean: If True, return the element-wise mean over all tokens;
            otherwise return the per-token vectors.
        embedding_dim: Length of the zero vector substituted for tokens that
            are missing from ``embeddings_index``; should equal the length of
            the vectors stored in ``embeddings_index``.

    Returns:
        With ``enable_mean=False``: a list with one vector per token (empty
        list when there are no tokens). With ``enable_mean=True``: a single
        1-D array; an all-zero array of length ``embedding_dim`` when the
        sentence yields no tokens.
    """
    if not isinstance(sentence, str):
        # A missing value has no `.lower()`; embed it as empty text instead of crashing.
        sentence = ''

    tokens = word_tokenize(sentence.lower())  # tokenize the lower-cased text
    embeddings = [
        # Tokens outside the vocabulary get a fresh zero vector.
        embeddings_index[token] if token in embeddings_index else np.zeros(embedding_dim)
        for token in tokens
    ]

    if not enable_mean:
        return embeddings
    if not embeddings:
        # np.mean([]) would return a scalar NaN (with a RuntimeWarning) rather
        # than a vector, which breaks building a fixed-width feature table.
        return np.zeros(embedding_dim)
    return np.mean(embeddings, axis=0)

# Text features
# Load the 50-dimensional GloVe vectors (glove.6B.50d.txt) via the project helper.
glove_embeddings = load_glove_embeddings('/Users/wzq/Desktop/coding/glove.6B/glove.6B.50d.txt')

# One mean-pooled 50-d vector per product name; each vector becomes one row.
train_sentence_embeddings = pd.DataFrame([
    sentence_to_embedding(name, glove_embeddings, enable_mean=True, embedding_dim=50)
    for name in train_data['product_name']
])
test_sentence_embeddings = pd.DataFrame([
    sentence_to_embedding(name, glove_embeddings, enable_mean=True, embedding_dim=50)
    for name in test_data['product_name']
])
# display(test_sentence_embeddings)

# embeddings_padded = pad_sequences(sentence_embedding_mean.tolist(), maxlen=14 * 50, dtype='float32', padding='post', truncating='post')
# Copy each embedding component into its own product_name_embeddings_<i> column.
for target_frame, embedding_frame in (
    (train_data, train_sentence_embeddings),
    (test_data, test_sentence_embeddings),
):
    for i in range(50):
        target_frame['product_name_embeddings_' + str(i)] = embedding_frame.iloc[:, i]

# Fill missing values with 0 (another fill strategy could be substituted here).
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

# Persist the processed train / test frames.
train_data.to_csv('./data/train_processed_bls_cls3.csv')
test_data.to_csv('./data/test_processed_bls_cls3.csv')
# Output names used by earlier variants of this cell:
#   train/test_processed_bls_cls2.csv, train/test_processed_bls_cls.csv,
#   train/test_processed.csv, train/test_processed_product_name.csv
for processed_frame in (train_data, test_data):
    display(processed_frame)

Unnamed: 0,user_id,product_id,product_name,rating,votes,helpful_votes,ID,user_rating_mean,user_rating_count,user_rating_std,user_rating_var,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,user_votes_helpful_rate,user_helpful_rating,product_rating_mean,product_rating_count,product_rating_std,product_rating_var,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max,product_votes_helpful_rate,product_helpful_rating,product_name_embeddings_0,product_name_embeddings_1,product_name_embeddings_2,product_name_embeddings_3,product_name_embeddings_4,product_name_embeddings_5,product_name_embeddings_6,product_name_embeddings_7,product_name_embeddings_8,product_name_embeddings_9,product_name_embeddings_10,product_name_embeddings_11,product_name_embeddings_12,product_name_embeddings_13,product_name_embeddings_14,product_name_embeddings_15,product_name_embeddings_16,product_name_embeddings_17,product_name_embeddings_18,product_name_embeddings_19,product_name_embeddings_20,product_name_embeddings_21,product_name_embeddings_22,product_name_embeddings_23,product_name_embeddings_24,product_name_embeddings_25,product_name_embeddings_26,product_name_embeddings_27,product_name_embeddings_28,product_name_embeddings_29,product_name_embeddings_30,product_name_embeddings_31,product_name_embeddings_32,product_name_embeddings_33,product_name_embeddings_34,product_name_embeddings_35,product_name_embeddings_36,product_name_embeddings_37,product_name_embeddings_38,product_name_embeddings_39,product_name_embeddings_40,product_name_embeddings_41,product_name_embeddings_42,product_name_embeddings_43,product_name_embeddings_44,product_name_embeddings_45,product_name_embeddings_46,product_name_embeddings_47,product_name_embeddings_48,product_name_embeddings_49
0,1813,154533,Beautiful Thing,5,10,8,0,3.129129,333,1.411493,1.992312,7.873874,46,4.474474,32,0.568268,1.778186,4.8,5,0.447214,0.2,5.6,9,5.2,8,0.928571,4.457143,0.268777,0.704065,-0.704850,-0.046943,0.968450,0.016998,-0.354835,-0.106115,-0.204095,0.584430,-0.540680,0.216230,-0.051420,0.228735,0.667465,0.111256,0.746360,0.454310,0.051473,-0.558030,-0.694385,0.790550,-0.134327,0.267640,1.188850,-1.233670,-1.544600,0.812890,1.037625,-0.809220,2.534800,0.014870,0.088995,-0.118358,0.005690,0.203015,-0.046273,0.437550,-0.247170,-0.671570,-0.378545,0.126550,0.087245,0.062750,0.059166,0.266091,0.013202,-0.767775,0.244955,0.535740
1,1916,154533,Beautiful Thing,4,2,2,681987,3.177087,57181,1.636421,2.677874,9.088841,290,5.062398,274,0.556991,1.769607,4.8,5,0.447214,0.2,5.6,9,5.2,8,0.928571,4.457143,0.268777,0.704065,-0.704850,-0.046943,0.968450,0.016998,-0.354835,-0.106115,-0.204095,0.584430,-0.540680,0.216230,-0.051420,0.228735,0.667465,0.111256,0.746360,0.454310,0.051473,-0.558030,-0.694385,0.790550,-0.134327,0.267640,1.188850,-1.233670,-1.544600,0.812890,1.037625,-0.809220,2.534800,0.014870,0.088995,-0.118358,0.005690,0.203015,-0.046273,0.437550,-0.247170,-0.671570,-0.378545,0.126550,0.087245,0.062750,0.059166,0.266091,0.013202,-0.767775,0.244955,0.535740
2,506,154533,Beautiful Thing,5,6,6,288065,4.321429,140,1.005509,1.011048,2.264286,11,1.542857,9,0.681388,2.944570,4.8,5,0.447214,0.2,5.6,9,5.2,8,0.928571,4.457143,0.268777,0.704065,-0.704850,-0.046943,0.968450,0.016998,-0.354835,-0.106115,-0.204095,0.584430,-0.540680,0.216230,-0.051420,0.228735,0.667465,0.111256,0.746360,0.454310,0.051473,-0.558030,-0.694385,0.790550,-0.134327,0.267640,1.188850,-1.233670,-1.544600,0.812890,1.037625,-0.809220,2.534800,0.014870,0.088995,-0.118358,0.005690,0.203015,-0.046273,0.437550,-0.247170,-0.671570,-0.378545,0.126550,0.087245,0.062750,0.059166,0.266091,0.013202,-0.767775,0.244955,0.535740
3,923,154533,Beautiful Thing,5,9,8,365036,3.823529,85,1.025583,1.051821,7.847059,37,6.588235,37,0.839580,3.210160,4.8,5,0.447214,0.2,5.6,9,5.2,8,0.928571,4.457143,0.268777,0.704065,-0.704850,-0.046943,0.968450,0.016998,-0.354835,-0.106115,-0.204095,0.584430,-0.540680,0.216230,-0.051420,0.228735,0.667465,0.111256,0.746360,0.454310,0.051473,-0.558030,-0.694385,0.790550,-0.134327,0.267640,1.188850,-1.233670,-1.544600,0.812890,1.037625,-0.809220,2.534800,0.014870,0.088995,-0.118358,0.005690,0.203015,-0.046273,0.437550,-0.247170,-0.671570,-0.378545,0.126550,0.087245,0.062750,0.059166,0.266091,0.013202,-0.767775,0.244955,0.535740
4,1560,154533,Beautiful Thing,4,6,5,237732,3.328976,459,1.160978,1.347870,3.346405,13,2.111111,12,0.630859,2.100116,4.8,5,0.447214,0.2,5.6,9,5.2,8,0.928571,4.457143,0.268777,0.704065,-0.704850,-0.046943,0.968450,0.016998,-0.354835,-0.106115,-0.204095,0.584430,-0.540680,0.216230,-0.051420,0.228735,0.667465,0.111256,0.746360,0.454310,0.051473,-0.558030,-0.694385,0.790550,-0.134327,0.267640,1.188850,-1.233670,-1.544600,0.812890,1.037625,-0.809220,2.534800,0.014870,0.088995,-0.118358,0.005690,0.203015,-0.046273,0.437550,-0.247170,-0.671570,-0.378545,0.126550,0.087245,0.062750,0.059166,0.266091,0.013202,-0.767775,0.244955,0.535740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626133,1850,68934,Christina Aguilera,3,2,0,308330,2.541176,85,1.570206,2.465546,3.282353,13,1.788235,13,0.544803,1.384440,3.0,1,0.000000,0.0,2.0,2,0.0,0,0.000000,0.000000,-0.107687,0.383840,-0.503785,0.292935,-0.135358,0.405810,-0.468905,0.094495,-0.496455,0.462050,0.369300,0.412891,-0.220510,0.109210,0.774190,-0.468890,-0.695290,-0.788960,0.734905,0.153142,-0.582040,0.494575,0.242900,-0.115213,0.238495,0.111325,-0.030545,-0.010733,-0.894780,-1.424940,0.528670,-0.078103,0.608700,0.237630,-0.340450,-0.552400,0.331356,-0.033501,0.248120,-0.623675,-0.121405,0.140441,0.161580,-1.183100,0.329140,-0.780660,0.234350,-1.402450,-0.743550,0.911375
626134,1850,93762,Crafty Girl: Cool Stuff,1,6,5,346308,2.541176,85,1.570206,2.465546,3.282353,13,1.788235,13,0.544803,1.384440,1.0,3,0.000000,0.0,6.0,6,5.0,5,0.833333,0.833333,-0.287608,0.272768,-0.292728,-0.176264,0.451420,0.027846,-0.246985,-0.182565,-0.371716,0.416247,-0.342728,0.332362,-0.007798,0.464946,0.612393,0.019283,0.073924,0.320938,-0.141395,-0.353328,-0.320739,0.612942,0.442278,0.558374,0.451534,-1.047022,-0.780058,0.348870,0.915254,-0.385348,2.084086,0.058059,-0.157736,0.241030,0.141176,0.392803,-0.298320,0.026132,0.192714,-0.698054,0.103346,0.311204,0.061660,0.104004,0.524766,0.134058,0.512264,-0.426195,0.277659,0.482948
626135,1850,140202,Britney Spears,3,4,3,437598,2.541176,85,1.570206,2.465546,3.282353,13,1.788235,13,0.544803,1.384440,3.0,1,0.000000,0.0,4.0,4,3.0,3,0.750000,2.250000,-0.024990,0.221847,-0.311365,0.179465,-0.289515,0.541935,-0.144875,0.168316,-0.160712,0.768235,0.024143,0.198434,-0.786430,0.037910,1.215565,0.259915,-0.589145,-0.777785,0.493960,-0.538045,0.023655,1.004245,0.265425,0.663420,-0.921695,-0.176345,-0.305927,0.159429,0.121628,-1.323500,0.585955,-0.015920,0.275540,0.119440,0.152525,-0.221026,0.617360,-1.100200,-0.338090,-1.022855,0.239750,0.100465,0.226935,-0.322415,1.173260,-1.144225,0.402165,-1.504700,-0.868725,0.659340
626136,1850,79222,Get Wise! Mastering Writing Skills,5,0,0,449862,2.541176,85,1.570206,2.465546,3.282353,13,1.788235,13,0.544803,1.384440,5.0,1,0.000000,0.0,0.0,0,0.0,0,0.000000,0.000000,-0.428002,0.033678,-0.099913,-0.499353,0.219213,-0.195695,-0.306149,-0.135118,-0.175114,0.526192,-0.173643,0.654205,-0.232683,0.119131,0.226358,-0.261939,0.215126,0.376141,0.167775,-0.366666,0.213047,0.734962,0.294190,0.164725,0.837735,-1.030912,-0.733617,-0.359420,0.460348,-0.552865,2.440900,0.211667,-0.345238,-0.239127,0.032635,0.346335,-0.005088,0.304475,0.210065,-0.375245,0.228594,-0.068514,-0.119887,0.258302,-0.033174,-0.006719,0.407350,0.051236,-0.121967,0.654142


Unnamed: 0,ID,user_id,product_id,product_name,user_rating_mean,user_rating_count,user_rating_std,user_rating_var,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,user_votes_helpful_rate,user_helpful_rating,product_rating_mean,product_rating_count,product_rating_std,product_rating_var,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max,product_votes_helpful_rate,product_helpful_rating,product_name_embeddings_0,product_name_embeddings_1,product_name_embeddings_2,product_name_embeddings_3,product_name_embeddings_4,product_name_embeddings_5,product_name_embeddings_6,product_name_embeddings_7,product_name_embeddings_8,product_name_embeddings_9,product_name_embeddings_10,product_name_embeddings_11,product_name_embeddings_12,product_name_embeddings_13,product_name_embeddings_14,product_name_embeddings_15,product_name_embeddings_16,product_name_embeddings_17,product_name_embeddings_18,product_name_embeddings_19,product_name_embeddings_20,product_name_embeddings_21,product_name_embeddings_22,product_name_embeddings_23,product_name_embeddings_24,product_name_embeddings_25,product_name_embeddings_26,product_name_embeddings_27,product_name_embeddings_28,product_name_embeddings_29,product_name_embeddings_30,product_name_embeddings_31,product_name_embeddings_32,product_name_embeddings_33,product_name_embeddings_34,product_name_embeddings_35,product_name_embeddings_36,product_name_embeddings_37,product_name_embeddings_38,product_name_embeddings_39,product_name_embeddings_40,product_name_embeddings_41,product_name_embeddings_42,product_name_embeddings_43,product_name_embeddings_44,product_name_embeddings_45,product_name_embeddings_46,product_name_embeddings_47,product_name_embeddings_48,product_name_embeddings_49
0,0,1916,185507,Maria [Australia],3.177087,57181,1.636421,2.677874,9.088841,290,5.062398,274,0.556991,1.769607,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,-0.127962,0.760284,0.033650,0.293735,0.133326,0.157034,-0.091510,-0.019397,-0.297530,-0.227621,0.257265,0.350088,-0.270132,-0.506997,0.450198,-0.155560,0.073660,-0.186825,-0.231528,0.273275,-0.368023,0.083898,0.445597,0.483992,0.582603,-0.707140,0.039945,-0.421305,-0.350972,-0.245525,2.197750,-0.142304,0.020252,-0.104307,-0.298925,-0.397750,0.493244,0.058490,0.206745,-0.356597,0.426220,-0.404720,0.360432,-0.575941,-0.168985,0.620425,-0.249920,-0.631971,0.307535,0.412900
1,1,1759,143430,"The Mitchells: Five for Victory (Van Stockum, Hilda, ""Mitchells"" Series.)",2.159091,396,1.658867,2.751841,7.818182,91,2.744949,22,0.351098,0.758053,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,-0.061768,0.290024,-0.100216,-0.021020,0.109157,0.226238,-0.398371,-0.341678,-0.243907,-0.285389,-0.098447,0.164986,-0.535192,-0.195681,0.342699,-0.021038,0.222990,-0.066948,-0.486469,-0.145144,-0.091811,0.224055,0.320318,0.096743,-0.009046,-0.917202,-0.397798,-0.093497,-0.000371,-0.312151,2.431537,-0.030600,-0.278046,0.125526,0.173048,-0.036254,0.243203,-0.091143,0.288496,-0.093532,0.074945,0.119532,0.117767,-0.343035,-0.001427,0.294062,-0.017395,-0.351465,0.088295,0.139058
2,2,1980,155861,Superman Cartoons of Max & Dav,2.605333,375,1.483951,2.202111,8.226667,50,3.562667,37,0.433063,1.128274,4.000000,1.0,0.000000,0.000000,57.000000,57.0,56.000000,56.0,0.982456,3.929825,-0.304495,0.167180,-0.113720,-0.001227,-0.048765,0.655275,-0.328147,-0.928585,-0.137822,-0.370920,0.034489,0.528562,-0.486520,-0.436582,0.431873,-0.125909,-0.020044,0.601835,-0.405915,0.094026,0.217900,0.130097,0.152542,0.250303,0.303188,-0.670387,-0.595923,-0.252234,-0.213731,-0.194479,1.248129,0.104120,0.056142,-0.297540,-0.362604,0.587309,0.054943,-0.398184,-0.168702,-0.112553,0.283088,0.202617,-0.445486,-0.477901,0.101660,0.550085,0.048121,-0.351010,-0.101229,0.035824
3,3,1502,71535,The Rainbow Fish Anniversary Edition,2.961404,16582,1.612352,2.599680,6.405801,380,1.857074,64,0.289905,0.858526,1.833333,6.0,1.169045,1.366667,21.000000,44.0,13.000000,27.0,0.619048,1.134921,-0.028603,0.400588,-0.624342,0.029958,0.468965,-0.216015,-0.834716,-0.557355,0.708126,-0.263244,0.006142,-0.014065,0.254106,0.522730,0.417060,-0.395692,-0.142170,-0.169282,-0.990246,-0.081396,0.298643,-0.498574,0.072523,-0.124486,0.011916,-0.934530,-0.826152,0.107506,-0.123441,-0.215588,2.314362,-0.123350,-0.512468,0.270784,-0.324215,0.039067,0.399435,-0.225956,-0.458706,-0.277173,-0.078240,-0.307158,-0.301600,-0.487471,0.037698,0.583928,0.061061,-0.537760,-0.144369,-0.365541
4,4,1255,13521,Confessions (Oxford World's Classics),1.976690,429,1.383959,1.915343,21.960373,162,9.130536,94,0.415773,0.821855,4.333333,6.0,0.516398,0.266667,7.333333,13.0,4.833333,9.0,0.659091,2.856061,-0.173634,0.616850,-0.377821,-0.024054,0.028313,-0.003706,-0.192506,-0.647884,-0.126601,0.119717,0.259958,0.371363,-0.508007,-0.195870,0.449019,-0.155303,-0.205652,0.249200,-0.785713,0.284739,0.162540,0.397328,0.336071,0.229033,-0.088791,-1.097267,-0.534676,-0.550170,-0.647570,-0.120188,2.562539,-0.229783,-0.212821,0.031881,-0.109053,-0.119501,0.177133,-0.038474,0.123723,-0.279069,0.554760,-0.144676,0.134539,-0.068775,-0.235763,0.529583,0.138481,-0.396277,-0.283272,-0.256310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223548,223548,1916,214704,"Godzilla, King of the Monsters",3.177087,57181,1.636421,2.677874,9.088841,290,5.062398,274,0.556991,1.769607,1.857143,7.0,0.899735,0.809524,9.571429,24.0,7.714286,21.0,0.805970,1.496802,0.410091,0.086337,-0.235960,0.244460,0.379452,0.180221,0.101693,-0.378660,-0.218282,0.008452,-0.028664,0.444486,-0.155880,-0.120452,0.409600,0.166420,-0.036156,0.266682,-0.918643,0.220898,0.083975,-0.131528,0.099591,-0.157672,0.068715,-1.460838,-0.823032,-0.247588,0.039837,-0.351253,2.289178,0.003365,-0.225706,-0.204568,0.007252,0.227559,-0.009937,-0.428073,-0.210756,-0.245180,-0.232975,0.261822,-0.117066,-0.085076,-0.090386,0.542517,-0.008391,-0.779783,-0.190348,-0.343293
223549,223549,1502,169489,Houses of the Holy,2.961404,16582,1.612352,2.599680,6.405801,380,1.857074,64,0.289905,0.858526,4.000000,15.0,1.133893,1.285714,7.733333,46.0,1.333333,3.0,0.172414,0.689655,0.841608,0.499777,-0.441780,-0.469032,0.685012,-0.063602,-0.297390,-0.228468,-0.069733,-0.391299,-0.270767,-0.343668,-0.003650,-0.399872,0.073792,0.139509,0.107171,-0.054267,-0.483280,0.151394,0.189307,-0.127260,-0.218750,-0.057654,-0.384425,-1.335274,-0.756673,-0.047440,-0.079184,-0.159740,3.269600,-0.224315,-0.243575,-0.233430,0.011796,0.344925,0.202168,-0.312858,0.031608,0.459764,-0.204035,-0.123490,-0.158017,0.257991,-0.072135,0.440195,-0.513949,-0.577805,-0.299442,-0.928587
223550,223550,874,150908,Frankenstein and the Monster from Hell,4.010942,2102,0.795259,0.632436,9.618934,227,8.303996,204,0.863297,3.462634,2.636364,11.0,1.361817,1.854545,4.090909,10.0,2.727273,8.0,0.666667,1.757576,0.409113,0.011100,-0.121428,-0.334104,0.169245,0.406202,-0.205720,-0.364872,-0.144278,-0.053057,-0.143688,0.336127,-0.297962,0.169490,0.490217,0.012372,0.242613,0.355915,-0.716638,-0.044168,0.129630,-0.083422,0.231654,0.150964,0.220388,-1.323462,-0.905052,0.059775,0.196431,-0.402482,2.323515,-0.050236,-0.282992,-0.104398,-0.145231,0.352620,-0.032197,-0.314936,0.312288,-0.300044,-0.234717,0.238423,-0.246091,-0.245436,0.008666,0.332809,0.197266,-0.858108,0.037638,-0.281256
223551,223551,922,198737,X-Men,3.958963,463,1.216962,1.480996,5.019438,42,3.732181,42,0.743546,2.943670,3.326923,52.0,1.339024,1.792986,15.019231,196.0,9.000000,141.0,0.599232,1.993598,0.765920,0.187770,0.529540,-0.270440,-0.005430,0.526460,-0.315670,0.157120,-0.225980,-0.013385,-0.125430,1.156100,-0.807620,0.674120,-0.614130,-1.105700,-0.312580,0.415660,-1.183000,0.299000,-1.010700,0.514370,0.834200,0.308290,0.106150,0.331250,-1.289400,-1.000100,0.000126,-0.036833,-0.441270,0.282460,-1.027700,-1.517400,0.224690,0.509690,-0.833340,-1.405500,-0.572480,-1.881300,0.177770,-0.888020,-0.477400,-0.891320,-0.850960,1.257500,0.745990,-0.560890,-0.162060,-0.417890


In [71]:
# Quick inspection of the training frame.
# Only the last expression of a cell is rendered, so the value_counts() result
# below is computed and discarded; the cell shows the (rows, columns) shape.
train_data['rating'].value_counts()
train_data.shape

(745889, 7)

In [84]:
# Oversampling experiment: append one extra copy of every rating-1 and rating-2
# row to the full frame, then look at the resulting class counts.
duplicated_samples1 = train_data.loc[train_data['rating'].eq(1)]
duplicated_samples2 = train_data.loc[train_data['rating'].eq(2)]
a = pd.concat(
    [train_data, duplicated_samples1, duplicated_samples2],
    axis=0,
    ignore_index=True,
)
a['rating'].value_counts()

rating
5    416231
4    185193
3     79870
2     70892
1     58298
Name: count, dtype: int64

In [105]:
a = pd.read_csv('./data/submit_example_A202410161035.csv')
# display(a['rating'].value_counts())
# Cap predictions above 5 at 5, then round to whole ratings.
a['rating'] = a['rating'].clip(upper=5)
# save_zip_file(a)
a['rating'] = a['rating'].round()
cnt = a['rating'].value_counts()
# Print each rating's row count next to its share of all counted rows,
# in value_counts() order (most frequent first).
cnt_total = sum(cnt)
for n_rows in cnt:
    print(n_rows, n_rows / cnt_total)

108373 0.48477542238305904
74501 0.33325877979718455
39342 0.1759851131499018
1335 0.005971738245516723
2 8.946424337852769e-06


In [114]:
# Alternative counts that were assigned first and immediately overwritten:
# [416231, 185193, 79870, 35446, 29149]
l = [101075, 76738, 41891, 3153, 642]
# Print every count with its fraction of the total.
l_total = sum(l)
for count in l:
    print(count, count / l_total)

101075 0.4522391599067557
76738 0.3433482923860957
41891 0.1874326059624428
3153 0.014107445670897856
642 0.0028724960738079363


In [9]:
# Hard-coded 12 x 3 float array used as scratch data. Its provenance is not
# shown in this notebook; the values all lie in [0, 1], so presumably it is a
# pasted min-max-scaled feature sample. TODO confirm.
a = np.array([[7.62842181e-01, 4.90758275e-03, 1.37024614e-01],
        [2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
        [8.28423978e-01, 1.00000000e+00, 1.31421749e-01],
        [2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
        [9.31983403e-01, 4.23190107e-03, 3.72131582e-02],
        [2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
        [7.23851417e-01, 1.13799020e-03, 4.92168973e-02],
        [5.34759358e-04, 3.44827586e-03, 3.94166338e-04],
        [7.23851417e-01, 1.13799020e-03, 4.92168973e-02],
        [5.34759358e-04, 6.89655172e-03, 7.88332676e-04],
        [7.23851417e-01, 1.13799020e-03, 4.92168973e-02],
        [8.02139037e-04, 1.03448276e-02, 1.18249901e-03]])

In [118]:
# Class counts (ratings 5 down to 1) after doubling the rating-1 and rating-2 rows.
t = [416231, 185193, 79870, 70892, 58298]
# Print every count with its fraction of the total, then show the total itself.
t_total = sum(t)
for count in t:
    print(count, count / t_total)
t_total

416231 0.5135585650056016
185193 0.22849679944329562
79870 0.09854605396281728
70892 0.08746872239303922
58298 0.0719298591952463


810484

In [4]:
# Three categorical fields, each with its own number of categories.
field_dims = [5, 10, 15]
# Start index of each field in a single shared index space: 0 followed by the
# running totals of all fields except the last, i.e. [0, 5, 15].
running_totals = np.cumsum(field_dims)
offsets = np.array([0, *running_totals[:-1]])
offsets

array([ 0,  5, 15, 30])

In [51]:
from utils import save_zip_file


# Round the predicted ratings to whole numbers and hand the frame to the
# project helper `save_zip_file` (presumably writes the zipped submission;
# confirm in utils).
# The frame is bound to `submission` rather than `re`, so the standard-library
# `re` module imported at the top of the notebook is no longer shadowed.
submission = pd.read_csv('data/submit_example_A202410152136.csv')
submission['rating'] = submission['rating'].round()
save_zip_file(submission)

In [None]:
## Text analysis
# Unique (product_name, product_id) pairs, for inspecting the product names.
df_unique_by_columns = train_data.drop_duplicates(subset=['product_name', 'product_id'])[['product_name', 'product_id']]
# df_unique_by_columns.head(10000).sort_values('product_name')  # uncomment to browse the names
product_name = df_unique_by_columns['product_name']
# display(product_name)

# Token-length analysis that produced the 14-token cutoff (kept for reference):
# tokenized_data = [word_tokenize(sentence.lower()) for sentence in product_name]
# lengths = [len(s) for s in tokenized_data]
# max_len = int(np.percentile(lengths, 90))  # length covering 90% of the samples -> 14

# `load_glove_embeddings` is imported from utils; `sentence_to_embedding` is
# imported from utils and re-defined in the preprocessing cell.

EMBEDDING_DIM = 50    # width of the glove.6B.50d vectors held in glove_embeddings
MAX_TOKENS = 14       # 90th-percentile token count from the analysis above
PREVIEW_COLUMNS = 14  # number of leading padded values copied into preview columns


def _flat_name_embedding(name):
    """Return the per-token vectors of `name` laid end to end as one 1-D float32 array."""
    vectors = sentence_to_embedding(name, glove_embeddings, embedding_dim=EMBEDDING_DIM)
    # reshape(-1) flattens a list of per-token vectors and leaves an already
    # flat (or empty) result unchanged.
    return np.asarray(vectors, dtype='float32').reshape(-1)


# Build the flattened embedding of each of the first 10 product names.
tmp = pd.DataFrame()
tmp['product_name'] = train_data['product_name'].iloc[:10]
tmp['product_name_embeddings'] = tmp['product_name'].apply(_flat_name_embedding)

# display(tmp[['product_name','product_name_embeddings']].head())
# Pad / truncate every flattened sentence to MAX_TOKENS tokens' worth of values.
# The previous maxlen of 14 * 100 assumed 100-d vectors, and unflattened
# per-token vectors made pad_sequences return a 3-D array whose slices cannot
# be assigned to a single DataFrame column.
embeddings_padded = pad_sequences(
    tmp['product_name_embeddings'].tolist(),
    maxlen=MAX_TOKENS * EMBEDDING_DIM,
    dtype='float32',
    padding='post',
    truncating='post',
)
# NOTE(review): only the first 14 padded values are copied out, as in the
# original cell - presumably a quick sanity preview. Confirm whether all
# MAX_TOKENS * EMBEDDING_DIM columns were intended.
for i in range(PREVIEW_COLUMNS):
    cn = 'product_name_embeddings_' + str(i)
    tmp[cn] = embeddings_padded[:, i]
display(tmp.head())


# plt.figure(figsize=(10,6))
# plt.hist(lengths, bins=range(1, max(lengths) + 2), alpha=0.7)
# plt.title('Sentence Length Distribution')
# plt.xlabel('Length of Sentences (Number of Words)')
# plt.ylabel('Frequency')
# plt.xticks(range(1, max(lengths) + 1), rotation=45)
# plt.grid(axis='y', alpha=0.75)
# plt.show()