# 观察数据形式并对数据进行预处理
1. [Foursquare数据](https://sites.google.com/site/xueatalphabeta/academic-projects)已经下载好了。[本地路径](../../data/Foursqare/)
2. [MovieLens数据](http://grouplens.org/datasets/movielens/1m/)已经下载好了。[本地路径](../../data/ml-1m/)

In [1]:
import numpy as np
import pandas as pd


# Foursquare需要处理的位置
1. 需要将题头与数据的分隔符去掉（手动）。
2. 需要将文件结尾的行数统计（即"(1021966 rows)"这一行）去掉（手动）。
3. 需要将题头中的空格去掉（手动）。
4. 需要指定读取时的分隔符为"|"。
5. 需要将数据中的空格去掉。
   1. id、user_id、venue_id通过设置字段来解决。
   2. latitude和longitude通过下面一个步骤来实现。
   3. created_at不处理空格。
6. 经纬度数据的处理。
   1. 先将经纬度中的空格部分替换为''。
   2. 然后将经纬度中为''的替换为'0'。
   3. 然后再将其转换为数值类型。
      1. 使用checkins_dataset['latitude'].apply(pd.to_numeric)这种形式的转换不行：转换之后再查看数据类型并没有变化（apply返回的是新的Series，结果需要赋值回原列才会生效）。使用checkins_dataset[['latitude', 'longitude']] = checkins_dataset[['latitude', 'longitude']].astype(np.float64)这种转换可以。
7. created_at如果需要转化为datetime类型，耗时约2分11.2秒。同样使用checkins_dataset[['created_at']] = checkins_dataset[['created_at']].astype(np.datetime64)来完成转换。而使用checkins_dataset['created_at'].apply(pd.to_datetime)这种方式，不仅再次查看时类型没有变化，而且非常耗时。


In [2]:
# Load the Foursquare check-in table from the pipe-delimited dump.
# The three id columns are read as 32-bit integers; the coordinate and
# timestamp columns are kept as raw strings (object) and cleaned in later cells.
checkins_dtypes = {
    'id': np.int32,
    'user_id': np.int32,
    'venue_id': np.int32,
    'latitude': object,
    'longitude': object,
    'created_at': object,
}
checkins_dataset = pd.read_csv(
    "../../data/Foursqare/checkins.dat",
    sep="|",
    dtype=checkins_dtypes,
)

In [3]:
# Summary statistics of the freshly loaded table. Only the three integer id
# columns are summarised here because the other columns are still strings.
checkins_dataset.describe()

Unnamed: 0,id,user_id,venue_id
count,1021966.0,1021966.0,1021966.0
mean,510998.5,1227084.0,150267.2
std,295016.3,663805.4,244008.8
min,16.0,1.0,1.0
25%,255507.2,609791.0,9310.0
50%,510998.5,1380121.0,32163.0
75%,766489.8,1775918.0,173298.0
max,1021981.0,2153502.0,1143020.0


In [4]:
# Show the column labels of the loaded table.
# Only a cell's final expression is rendered, so a preceding bare
# `checkins_dataset.head(5)` produced no output and has been removed.
checkins_dataset.columns

Index(['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at'], dtype='object')

In [5]:
# Check the per-column dtypes right after loading: ids are int32, the
# coordinate and timestamp columns are still object (string).
checkins_dataset.dtypes

id             int32
user_id        int32
venue_id       int32
latitude      object
longitude     object
created_at    object
dtype: object

In [6]:
# Remove every run of whitespace from the coordinate strings.
#
# The cleaned columns are assigned back to the frame rather than calling
# `Series.replace(..., inplace=True)` on a column selection: that form is a
# chained in-place update, which does not reach the parent frame when pandas
# copy-on-write is active. The pattern is written as a raw string so that `\s`
# is not parsed as an (invalid) Python string escape.
coordinate_columns = ['latitude', 'longitude']
checkins_dataset[coordinate_columns] = checkins_dataset[coordinate_columns].replace(
    r'\s+', '', regex=True
)
checkins_dataset.head(5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916,5222,,,2012-04-21 17:39:01
1,984222,15824,5222,38.8951118,-77.0363658,2012-04-21 17:43:47
2,984315,1764391,5222,,,2012-04-21 17:37:18
3,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840,5222,,,2012-04-21 17:42:58


In [7]:
# Put the placeholder string '0' wherever a coordinate is an empty string, so
# the columns can later be cast to float.
#
# The result is assigned back to the frame instead of using a chained
# `Series.replace(..., inplace=True)`, which does not update the parent frame
# when pandas copy-on-write is active. `regex=True` is dropped: the recorded
# output shows non-empty values were left untouched, i.e. the empty pattern
# was only ever matched as a literal whole value.
checkins_dataset[['latitude', 'longitude']] = checkins_dataset[
    ['latitude', 'longitude']
].replace('', '0')
checkins_dataset.head(5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916,5222,0.0,0.0,2012-04-21 17:39:01
1,984222,15824,5222,38.8951118,-77.0363658,2012-04-21 17:43:47
2,984315,1764391,5222,0.0,0.0,2012-04-21 17:37:18
3,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840,5222,0.0,0.0,2012-04-21 17:42:58


In [12]:
# Re-check the dtypes: the string replacements do not change column types, so
# latitude / longitude / created_at are still object at this point.
checkins_dataset.dtypes

id             int32
user_id        int32
venue_id       int32
latitude      object
longitude     object
created_at    object
dtype: object

In [18]:
# Cast the cleaned coordinate strings to 64-bit floats.
checkins_dataset[['latitude', 'longitude']] = checkins_dataset[['latitude', 'longitude']].astype(np.float64)
# Parse the timestamp strings into datetimes. The dtype is given with an
# explicit unit because a unit-less `np.datetime64` is rejected by
# `astype` in recent pandas releases; 'datetime64[ns]' yields the same
# resulting column dtype as before.
checkins_dataset['created_at'] = checkins_dataset['created_at'].astype('datetime64[ns]')

In [20]:
# Confirm the casts took effect: coordinates should now be float64 and
# created_at datetime64[ns].
checkins_dataset.dtypes

id                     int32
user_id                int32
venue_id               int32
latitude             float64
longitude            float64
created_at    datetime64[ns]
dtype: object

In [19]:
# Summary statistics again; latitude and longitude are now numeric and are
# therefore included. Rows whose coordinates were blank contribute zeros.
checkins_dataset.describe()

Unnamed: 0,id,user_id,venue_id,latitude,longitude
count,1021966.0,1021966.0,1021966.0,1021966.0,1021966.0
mean,510998.5,1227084.0,150267.2,14.3977,-33.51477
std,295016.3,663805.4,244008.8,18.88728,48.27883
min,16.0,1.0,1.0,-75.25097,-159.6708
25%,255507.2,609791.0,9310.0,0.0,-77.86
50%,510998.5,1380121.0,32163.0,0.0,0.0
75%,766489.8,1775918.0,173298.0,36.11465,0.0
max,1021981.0,2153502.0,1143020.0,78.21859,178.4242


In [21]:
# Build a copy of the table without the rows whose latitude or longitude is 0
# (0 is the placeholder written for blank coordinates in an earlier cell).
is_zero_latitude = checkins_dataset['latitude'].eq(0)
is_zero_longitude = checkins_dataset['longitude'].eq(0)
# Collect the index labels of the affected rows, then drop them by label.
rows_without_position = checkins_dataset.index[(is_zero_latitude | is_zero_longitude).values]
checkins_dataset_dropzero = checkins_dataset.drop(rows_without_position)

In [23]:
# Summary statistics of the filtered table (rows with a zero coordinate removed).
checkins_dataset_dropzero.describe()

Unnamed: 0,id,user_id,venue_id,latitude,longitude
count,396634.0,396634.0,396634.0,396634.0,396634.0
mean,510471.1,564134.9,132755.9,37.09707,-86.354065
std,305492.7,486569.4,228700.3,8.77876,37.983677
min,16.0,1.0,1.0,-75.250973,-159.670833
25%,242279.2,169290.0,7620.0,33.800745,-111.926052
50%,501987.5,429820.0,28304.0,39.099275,-85.758456
75%,775817.5,849647.0,148552.0,40.802071,-74.05653
max,1021981.0,2153361.0,1143011.0,78.21859,178.42424


In [24]:
# Preview the first five remaining rows; the original index labels are kept,
# so gaps mark the rows that were dropped.
checkins_dataset_dropzero.head(5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
3,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
7,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
9,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
10,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [14]:
# Spot-check a single check-in (id 984301) in the unfiltered table.
checkins_dataset.loc[checkins_dataset['id'].eq(984301)]

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916,5222,0,0,2012-04-21 17:39:01
