In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import time
import re

# 视频信息

In [3]:
# Read the video metadata CSV into a DataFrame and show its (rows, columns) size.
video = pd.read_csv(filepath_or_buffer='../data/video_related_data.csv')
video.shape

(4250105, 5)

In [4]:
# Preview the first five rows of the video table.
video.head(n=5)

Unnamed: 0,item_id,duration,father_id,tag_list,cast
0,24403453.0,6.0,,50365080;50338575;50313222;50165986,
1,22838795.0,7.0,,50001708;50323515;50125414,
2,24181187.0,5.0,,50181045;50008354;50186586;50097060;50136213;5...,
3,23549051.0,,,,
4,20276917.0,8.0,,50286627;50257578;50415463,


In [5]:
# Smallest and largest duration present, shown as a (min, max) pair.
(video['duration'].min(), video['duration'].max())

(np.float64(1.0), np.float64(16.0))

## 缺失值与重复值

In [6]:
# Null count per column -- missing values are severe.
video.isna().sum()

item_id           66
duration      364078
father_id    2820142
tag_list      592337
cast         3775461
dtype: int64

In [7]:
# Row count after dropping exact duplicate rows; if it equals the full row
# count, the table has no duplicated rows.
len(video.drop_duplicates())

4250105

# 用户画像信息

In [8]:
# Read the user portrait CSV into a DataFrame and show its (rows, columns) size.
user = pd.read_csv(filepath_or_buffer='../data/user_portrait_data.csv')
user.shape

(596906, 9)

In [9]:
user.head(n=5)
# Columns: id, device type, device RAM, device ROM, sex, age, education, occupation status, postal code

Unnamed: 0,user_id,device_type,device_ram,device_rom,sex,age,education,occupation_status,territory_code
0,10209854,2.0,5731.0,109581,1.0,2.0,0.0,1.0,865101.0
1,10230057,2.0,1877.0,20888,1.0,4.0,0.0,1.0,864102.0
2,10194990,2.0,7593.0,235438,2.0,3.0,1.0,1.0,866540.0
3,10046058,2.0,,55137,1.0,4.0,0.0,1.0,
4,10290885,2.0,2816.0,52431,1.0,4.0,0.0,0.0,


## 缺失值与重复值

In [10]:
# Number of distinct user_id values (a null, if present, counts as one value);
# a result below the row count means some user_id appears more than once.
user['user_id'].nunique(dropna=False)

596905

In [11]:
# Null count per column of the user table.
user.isna().sum()

user_id                  0
device_type            339
device_ram           37772
device_rom           28434
sex                   6447
age                   7738
education            11003
occupation_status     7983
territory_code       37281
dtype: int64

## 缺失值填补

In [12]:
# Check the column dtypes: device_ram and device_rom are object columns, every other column is numeric.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596906 entries, 0 to 596905
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   user_id            596906 non-null  int64  
 1   device_type        596567 non-null  float64
 2   device_ram         559134 non-null  object 
 3   device_rom         568472 non-null  object 
 4   sex                590459 non-null  float64
 5   age                589168 non-null  float64
 6   education          585903 non-null  float64
 7   occupation_status  588923 non-null  float64
 8   territory_code     559625 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 41.0+ MB


In [13]:
# Imputation plan: mode for discrete columns, mean for continuous columns.
user.loc[:, 'age'].value_counts()
# Age has already been bucketed into age bands.

age
3.0    254250
4.0    201286
2.0     80789
5.0     37283
1.0     15560
Name: count, dtype: int64

In [14]:
# Following the observation that age is already bucketed, every column is
# filled with its most frequent value.
# NOTE(review): device_ram / device_rom look numeric rather than categorical --
# confirm that mode imputation is really intended for them.
for column in user.columns:
    # Nothing to fill when the column has no nulls; and when it is entirely
    # null, value_counts() is empty, so .index[0] would raise IndexError.
    if not user[column].isna().any() or user[column].isna().all():
        continue
    # value_counts() sorts by descending frequency, so index[0] is the mode.
    user[column] = user[column].fillna(user[column].value_counts().index[0])
# Series.mode() also works on object columns such as device_ram / device_rom,
# but it returns a Series, so the fill value would have to be taken with .iloc[0].

In [15]:
# Null count per column again -- imputation is complete.
user.isna().sum()

user_id              0
device_type          0
device_ram           0
device_rom           0
sex                  0
age                  0
education            0
occupation_status    0
territory_code       0
dtype: int64

In [16]:
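# user['device_ram'].astype('float')
# Casting device_ram to float raises an error on values such as '7625;3595'.
# These are possibly users who logged in from several devices; device_rom has the same issue.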

In [17]:
# Rows whose device_ram lists more than one value separated by ';'.
# na=False counts entries that have no string value as non-matches, so the mask
# is always purely boolean (same convention as the masks used for the conversion).
user.loc[user["device_ram"].str.contains(';', na=False)]

Unnamed: 0,user_id,device_type,device_ram,device_rom,sex,age,education,occupation_status,territory_code
5289,10124806,2.0,7625;3595,55034;111304,2.0,3.0,0.0,1.0,862102.0
5732,10040742,2.0,7547;7567,111934,2.0,3.0,0.0,1.0,865307.0
7640,10534577,2.0,7396;7394,232167,2.0,4.0,2.0,1.0,862301.0
8921,10299809,2.0,3656;5664,110757;110069,1.0,2.0,0.0,1.0,864602.0
12574,10280999,2.0,7705;7690,112564,2.0,4.0,0.0,1.0,864101.0
...,...,...,...,...,...,...,...,...,...
584623,10172269,2.0,5734;11276,459930;102410,2.0,3.0,1.0,1.0,864110.0
585043,10263938,2.0,11388;3711,231151;53313,1.0,3.0,0.0,1.0,864414.0
587763,10268110,2.0,5666;5621,111289;227939,1.0,4.0,0.0,1.0,863208.0
591167,10053920,2.0,5666;5634,110022;111225,2.0,4.0,2.0,1.0,863201.0


In [18]:
# Rows whose device_rom lists more than one value separated by ';'.
# na=False counts entries that have no string value as non-matches, so the mask
# is always purely boolean (same convention as the masks used for the conversion).
user.loc[user["device_rom"].str.contains(';', na=False)]

Unnamed: 0,user_id,device_type,device_ram,device_rom,sex,age,education,occupation_status,territory_code
2766,10046846,2.0,3590,51872;52472,2.0,4.0,0.0,0.0,864512.0
5289,10124806,2.0,7625;3595,55034;111304,2.0,3.0,0.0,1.0,862102.0
7797,10207344,2.0,7625,228635;228835,1.0,3.0,0.0,1.0,863701.0
8921,10299809,2.0,3656;5664,110757;110069,1.0,2.0,0.0,1.0,864602.0
16212,10484382,2.0,2805,52170;52370,2.0,2.0,0.0,0.0,863416.0
...,...,...,...,...,...,...,...,...,...
586482,10371710,2.0,5725,53693;53673,1.0,4.0,1.0,1.0,864403.0
587763,10268110,2.0,5666;5621,111289;227939,1.0,4.0,0.0,1.0,863208.0
591167,10053920,2.0,5666;5634,110022;111225,2.0,4.0,2.0,1.0,863201.0
591547,10149577,2.0,1860,3760;10363,2.0,2.0,0.0,0.0,864313.0


多种设备登录的情况不相同，通过观察和思考，我们直接取用户ram和rom最高规格的那一个即可

In [19]:
def extract_max_value(value):
    """Return the largest number in a ';'-separated capacity string.

    Each token may carry a 'GB' suffix and surrounding whitespace, both of
    which are removed before parsing. Empty tokens (for example the one
    produced by a trailing ';') are ignored.

    Args:
        value: String such as '7625;3595' or '8GB; 16GB'.

    Returns:
        The maximum parsed value: an int when the winning token is an integer
        literal, otherwise a float (e.g. for '5731.0').

    Raises:
        ValueError: If a token is not numeric, or if no token holds a number.
    """
    def _to_number(text):
        # Prefer int so integer inputs keep returning int; fall back to float
        # for decimal literals such as '5731.0', which int() rejects.
        try:
            return int(text)
        except ValueError:
            return float(text)

    tokens = (part.replace('GB', '').strip() for part in value.split(';'))
    numbers = [_to_number(token) for token in tokens if token]
    if not numbers:
        raise ValueError(f"no numeric value found in {value!r}")
    return max(numbers)

In [29]:
# Flag the rows whose RAM / ROM field holds several ';'-separated values.
# na=False makes missing entries count as "no match" instead of yielding NaN.
mask_ram = user["device_ram"].str.contains(';', na=False)
mask_rom = user["device_rom"].str.contains(';', na=False)

# Replace each flagged entry with its largest value; all other rows stay untouched.
user.loc[mask_ram, "device_ram"] = user.loc[mask_ram, "device_ram"].apply(extract_max_value)
user.loc[mask_rom, "device_rom"] = user.loc[mask_rom, "device_rom"].apply(extract_max_value)

In [31]:
# Convert both device capacity columns from object to float.
user['device_ram'] = user['device_ram'].astype(float)
user['device_rom'] = user['device_rom'].astype(float)

In [32]:
# Re-inspect the column dtypes and non-null counts after the float conversion above.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596906 entries, 0 to 596905
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   user_id            596906 non-null  int64  
 1   device_type        596906 non-null  float64
 2   device_ram         596906 non-null  float64
 3   device_rom         596906 non-null  float64
 4   sex                596906 non-null  float64
 5   age                596906 non-null  float64
 6   education          596906 non-null  float64
 7   occupation_status  596906 non-null  float64
 8   territory_code     596906 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 41.0 MB


In [36]:
# Spot-check a row that previously held ';'-separated RAM/ROM values to confirm
# the conversion worked: each field should now show a single number.
user.iloc[8921, :]

user_id              10299809.0
device_type                 2.0
device_ram               5664.0
device_rom             110757.0
sex                         1.0
age                         2.0
education                   0.0
occupation_status           1.0
territory_code         864602.0
Name: 8921, dtype: float64