# 课题实践一 计算浏览次数最多的商品排行榜
## 加载原始数据并整理

In [6]:
import pandas as pd

events = pd.read_csv("./events.csv")
print(events.dtypes)

# 填充空值
events = events.fillna(0)

# 转换transactionid列为整型
events["transactionid"] = events["transactionid"].astype("int")
events["event"] = events["event"].astype("str")

# 增加一列，显示可读时间
events["action_time"] = pd.to_datetime(events["timestamp"], unit='ms')

# 展示开头的20个数据行
events.head(20)

timestamp          int64
visitorid          int64
event             object
itemid             int64
transactionid    float64
dtype: object


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,action_time
0,1433221332117,257597,view,355908,0,2015-06-02 05:02:12.117
1,1433224214164,992329,view,248676,0,2015-06-02 05:50:14.164
2,1433221999827,111016,view,318965,0,2015-06-02 05:13:19.827
3,1433221955914,483717,view,253185,0,2015-06-02 05:12:35.914
4,1433221337106,951259,view,367447,0,2015-06-02 05:02:17.106
5,1433224086234,972639,view,22556,0,2015-06-02 05:48:06.234
6,1433221923240,810725,view,443030,0,2015-06-02 05:12:03.240
7,1433223291897,794181,view,439202,0,2015-06-02 05:34:51.897
8,1433220899221,824915,view,428805,0,2015-06-02 04:54:59.221
9,1433221204592,339335,view,82389,0,2015-06-02 05:00:04.592


## 获取点击数据

In [7]:
import time

# 从日期字符串获取时间戳
def get_timestamp(dstr):
    timeArray = time.strptime(dstr, "%Y-%m-%d %H:%M:%S")
    timeStamp = int(time.mktime(timeArray)) * 1000
    return timeStamp
    
start_time_str = "2015-08-01 00:00:00"
start_timestamp = get_timestamp(start_time_str)
print(start_timestamp)

print(events.shape[0])

# 只保留有点击行为的记录，且取时间为2015-08-01 00:00:00以后的数据
events = events.loc[(events["event"] == "view") & (events["timestamp"] > start_timestamp)]
print(events.shape[0])

events = events[["visitorid", "itemid"]]
events = events.drop_duplicates()
print(events.shape[0])

events.head(10)

1438358400000
2756101
837025
682092


Unnamed: 0,visitorid,itemid
610621,1190029,338463
610622,1140512,184998
610623,1111168,456909
610624,938839,73918
610625,505655,409804
610626,581285,133215
610627,830739,82281
610628,1084627,457045
610629,206361,347025
610630,581285,144607


## 计算每个商品被浏览（点击）的次数

In [8]:
items = events.groupby("itemid").agg(count=("visitorid", "count")).reset_index()
items.head(20)

Unnamed: 0,itemid,count
0,3,2
1,4,2
2,6,5
3,16,4
4,19,10
5,22,1
6,25,8
7,26,2
8,29,5
9,32,20


## 按照被浏览（点击）次数排序

In [9]:
items = items.sort_values("count", ascending=False)
items.head(20)

Unnamed: 0,itemid,count
57617,187946,2876
67309,219512,818
141761,461686,675
58259,190000,623
133421,434782,618
117976,384302,573
98237,320130,525
5199,17114,509
29712,96924,458
139736,455183,447
