Python for Data Analysis
----

# Book
- CN: 利用Python进行数据分析 78.4MB.pdf
- EN: Python for Data Analysis 2nd Edition.pdf

![cover](images/cover1.png)


# 概述
原书英文版于2013年由O'Reilly出版，中文版由机械工业出版社出版。

全书12个章节：
- 准备工作
- 引言
- IPython
- NumPy基础
- pandas入门
- 数据加载、存储与文件格式
- 数据规整化
- 绘图和可视化
- 数据聚合与分组运算
- 时间序列
- 金融和经济数据
- NumPy高级应用
- 附录
 - Python语言精要


# 源代码

 `git clone https://github.com/pydata/pydata-book -b 1st-edition`

# 读书笔记
## Ch01 准备工作
本章如题，介绍用Python分析数据之前的准备工作。
- what： 处理对象是什么？ 主要是结构化数据表格
- How： 工具是什么？ Python， NumPy/Matplotlib/IPython/pandas/SciPy
- Why： Python简单易用，有强大的公共库资源
- Setup: 准备代码调试编写环境 

## Ch02 引言
### JSON 数据集

In [None]:
#### Load JSON
import json

# The file is parsed one line at a time: each line is decoded as its own JSON document.
path = '/opt/Work/ML/pydata-book/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# Use a context manager so the file handle is closed as soon as parsing finishes.
with open(path) as f:
    records = [json.loads(line) for line in f]
records[0]

In [None]:
type(records[0])

#time_zones = [rec['tz'] for rec in records]
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:5]

In [None]:
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its occurrence count.
    """
    counts = {}
    for x in sequence:
        # dict.get supplies 0 for unseen items, so no separate membership
        # test against counts.keys() is needed.
        counts[x] = counts.get(x, 0) + 1
    return counts
            
get_counts(time_zones)

In [None]:
from collections import defaultdict

def get_counts2(sequence):
    """Tally the occurrences of every item in ``sequence``.

    Returns a ``defaultdict(int)``, so looking up an item that never
    appeared yields 0 instead of raising ``KeyError``.
    """
    tally = defaultdict(int)  # unseen keys start at 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

get_counts2(time_zones)

In [None]:
counts = get_counts2(time_zones)

In [None]:
type(counts) #collections.defaultdict
type(counts.items()) # dict_items

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs, smallest first.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of pairs to return. Values <= 0 yield an empty list.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order; the
        last element is the most frequent key.
    """
    if n <= 0:
        # A bare [-0:] slice would return the whole list instead of nothing.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
counts = get_counts2(time_zones)
top_counts(counts)

In [None]:
import pandas as pd; import numpy as np
from pandas import DataFrame, Series
frame = DataFrame(records)

In [None]:
tz_counts = frame['tz'].value_counts()

In [None]:
clean_tz = frame['tz'].fillna('Missing')

In [None]:
type(clean_tz)

In [None]:
# NB: this boolean-mask assignment relabels the entries of clean_tz that are
# empty strings; NaN values are not matched by the == '' comparison.
clean_tz[clean_tz == ''] = 'Unknown'

In [None]:
tz_counts = clean_tz.value_counts()
tz_counts[:10]

In [None]:
frame['a'].head()
frame.a.head()

In [None]:
results = Series(x.split()[0] for x in frame.a.dropna())
print(results.head(5))
print( results.value_counts()[:8] )

In [None]:
# ?????
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')

In [None]:
print( operating_system[:5])

In [None]:
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

In [None]:
# 用于按照升序排列
indexer = agg_counts.sum(1).argsort()
indexer[:10]

In [None]:
count_subset = agg_counts.take(indexer)[-10:]
count_subset

In [None]:
count_subset.plot(kind='barh', stacked=True)

In [None]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh',stacked=True)

### MovieLens 1M数据集


In [None]:
import pandas as pd

# The three MovieLens 1M tables are read with '::' as the delimiter and no
# header row, so column names are supplied here. A multi-character separator
# is only handled by the Python parsing engine, so request it explicitly
# instead of relying on the automatic fallback (which emits a ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')


In [None]:
users.head()

In [None]:
ratings.head()

In [None]:
movies.head()

In [None]:
data = pd.merge(pd.merge(ratings, users), movies)
data.head()

In [None]:
mean_ratings = data.pivot_table('rating', index='title',columns='gender', aggfunc=np.mean)
mean_ratings[:5]

In [None]:
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]

In [None]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

In [None]:
# Restrict the pivot table to the rows labelled by active_titles.
# .loc is the label-based indexer; .ix was removed in pandas 1.0.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings.head()

In [None]:
# Difference between the male and female mean rating for each title.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# sort_index(by=...) is no longer accepted by pandas; sort_values is the
# supported way to order rows by a column.
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff.head()

In [None]:
sorted_by_diff[::-1].head()

In [None]:
# Standard deviation of the ratings, grouped by movie title
rating_std_by_title = data.groupby('title')['rating'].std()

# Keep only the titles listed in active_titles (.ix was removed in pandas 1.0)
rating_std_by_title = rating_std_by_title.loc[active_titles]

# Sort the Series by value in descending order
rating_std_by_title.sort_values(ascending=False).head()

### 全美婴儿姓名分析
#### 1880-2010年间全美婴儿姓名

In [None]:
! head -n 10 names/yob1881.txt

In [None]:
import pandas as pd
names1880 = pd.read_csv('names/yob1880.txt', names=['name', 'sex', 'births'] )
names1880.head()

In [None]:
names1880.groupby('sex')['births'].sum()

names1880.groupby('sex').births.sum()

In [None]:
# 1880-2010
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = 'names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    
    frame['year'] = year
    pieces.append(frame)
    
names = pd.concat(pieces, ignore_index=True)
names.head()

In [None]:
names.groupby('sex')['births'].sum()

In [None]:
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc=sum)
total_births.tail()

In [None]:
total_births.plot(title='Total births by sex and years')

In [None]:
def add_prop(group):
    """Add a ``prop`` column holding each row's share of the group's births.

    The ``births`` column is converted to float before dividing by its sum.
    The frame is modified in place and the same object is returned.
    """
    as_float = group['births'].astype(float)
    total = as_float.sum()
    group['prop'] = as_float.div(total)
    return group
# group_keys=False keeps the original row index; otherwise pandas >= 2.0 adds
# 'year'/'sex' index levels that clash with the columns of the same name in
# the later groupby(['year', 'sex']) calls.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names.head()

In [None]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

In [None]:
g=names.groupby(['year', 'sex'])
type(g)

 
 ''' [:1000] cannot work as desired '''

def get_top1000(group):
    """Return the (at most) 1000 rows of ``group`` with the most births.

    Rows come back ordered by ``births`` in descending order.
    """
    # sort_index(by=...) is no longer accepted by pandas; use sort_values.
    return group.sort_values(by='births', ascending=False)[:1000]

grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()

 			name 	sex 	births 	year 	prop
year 	sex 						
1947 	F 	431022 	Linda 	F 	99651 	1947 	0.056229
1948 	F 	441381 	Linda 	F 	96185 	1948 	0.056657
1947 	M 	437125 	James 	M 	94601 	1947 	0.051768
1957 	M 	544528 	Michael 	M 	92700 	1957 	0.043008
1947 	M 	437126 	Robert 	M 	91557 	1947 	0.050102

def get_top1000(group):
    """Pass-through variant: hand each group back unchanged.

    No sorting or truncation happens here, so every row of the group is kept.
    """
    return group

grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()
# Sorting after the groups are recombined ranks all rows together, so the
# slice keeps 1000 rows in total rather than 1000 per (year, sex) group.
# sort_index(by=...) is no longer accepted by pandas; use sort_values.
top1000 = top1000.sort_values(by='births', ascending=False)[:1000]
top1000.head()

In [None]:
def get_top1000(group):
    """Return the (at most) 1000 rows of ``group`` with the most births.

    Rows come back ordered by ``births`` in descending order.
    """
    # sort_index(by=...) is no longer accepted by pandas; use sort_values.
    return group.sort_values(by='births', ascending=False)[:1000]

grouped = names.groupby(['year', 'sex'],as_index=False)
top1000 = grouped.apply(get_top1000)
top1000.head()

#### 分析命名趋势

In [None]:
boys = top1000[top1000.sex == 'M']
girls= top1000[top1000.sex == 'F']

In [None]:
total_births = top1000.pivot_table('births', index='year', columns='name',aggfunc=sum)
#totle_births = names.pivot_table('births', index='year', columns='name',aggfunc=sum)
total_births.head()

In [None]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
#subset = total_births[['John', 'Mary']]
subset.plot(subplots=True, figsize=(12,10), grid=False, title="Number of births per year")

#### 评估命名多样性的增长

In [None]:
table        = top1000.pivot_table('prop', index='year', columns='sex',aggfunc=sum)
table.plot(title='Sum of the table1000.prop by year and sex', yticks=np.linspace(0, 1.2,13), xticks=range(1880,2020,10) )

In [None]:
boys[boys.year==1947].head()


In [None]:
df = boys[boys.year == 1947]

# Running total of 'prop', accumulated from the largest value downwards.
# sort_index(by=...) is no longer accepted by pandas; use sort_values.
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum.head(10)

In [None]:
prop_cumsum.searchsorted(0.5)

In [None]:
df = boys[boys.year == 1900]
# sort_index(by=...) is no longer accepted by pandas; use sort_values.
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
# searchsorted gives a 0-based position, so add 1 to turn it into a count.
in1900.searchsorted(0.5) + 1

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    Args:
        group: DataFrame with a ``prop`` column.
        q: Cumulative proportion to reach (default 0.5).

    Returns:
        The 1-based position at which the cumulative sum of ``prop``,
        taken in descending order, first reaches ``q``.
    """
    # sort_index(by=...) is no longer accepted by pandas; use sort_values.
    group = group.sort_values(by='prop', ascending=False)
    # searchsorted gives a 0-based position, hence the +1.
    return group.prop.cumsum().searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()

In [None]:
# Plot the per-year counts held in `diversity` (title typo "polular" fixed).
diversity.plot(title='Number of popular names in top 50%')

#### 最后一个字母的变革

In [None]:
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letters'
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc=sum)

In [None]:
# 我们选择代表性的三年
subtable = table.reindex(columns=[1910,1960,2010], level='year')
subtable.head()

In [None]:
subtable.sum()

In [None]:
letter_prop = subtable/subtable.sum().astype(float)

In [None]:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2,1, figsize=(10,8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)


In [None]:
# Pick a few representative last letters: d, n, y
letter_prop = table / table.sum().astype(float)
# .loc is the label-based indexer; .ix was removed in pandas 1.0.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [None]:
dny_ts.plot()

#### 男孩 ---> 女孩


In [None]:
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])

In [None]:
all_names.shape

In [None]:
lesley_like = all_names[mask]
lesley_like

In [None]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

In [None]:
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc=sum)
table = table.div(table.sum(1), axis=0)
table.tail()

In [None]:
table.plot(style={'M': 'k-', 'F':'k--'})

#### 总结

本章主要介绍了DataFrame（DF）的用法，常规操作就能解决很多问题：
- 分组
- 统计
- 透视图
- 画图
    - pd.plot
    - matplotlib.pyplot.subplot
    
编写或者说抄写代码的时候才发现问题。比如，
+ 1. 区别：
    * groupby(['year', 'sex'])
    * groupby(['year', 'sex'], as_index=False)
+ 2. 取列：
    * names.year
    * names['year']
+ 3. 筛选
    * names[names.year==1880]
+ 4. 文件
    * [ json.loads(line) for line in open('some/file/path') ]

## IPython

## NumPy基础

#### ndarray: 一种多维数组对象


In [None]:
import numpy as np
data = np.random.rand(2,3)

In [None]:
data
data*10
data.shape
data.dtype

In [None]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1
arr1.shape

In [None]:

data2 = [[1,2,3,4], [5,6,7,8]]
arr2 = np.array(data2) 
arr2.ndim  # 2
arr2.shape # (2,4)
arr2.dtype #int64

In [None]:
np.zeros((3,6))

In [None]:
np.empty((2,3,2))

In [None]:
np.arange(15)

In [None]:
arr1 = np.array([1,2,3], dtype=np.float64)
arr2 = np.array([1,2,3], dtype=np.int32)

In [None]:
arr1.dtype

In [None]:
arr2.dtype

In [None]:
arr = np.array([1,2,3,4,5])
arr.dtype

In [None]:
float_arr = arr.astype(np.float64)
float_arr.dtype

In [None]:
# np.string_ was removed in NumPy 2.0; np.bytes_ is the same byte-string
# dtype and is also available in earlier NumPy releases.
numberic_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)
# astype(float) parses each byte string into a floating-point number.
numberic_strings.astype(float)

## pandas入门


## 数据加载、存储与文件格式

## 数据规整化

## 绘图和可视化

## 数据聚合与分组运算

## 时间序列

## 金融和经济数据

## NumPy高级应用