### Part 2, Pandas

In [None]:
# http://pandas-docs.github.io/pandas-docs-travis/api.html

In [1]:
import numpy as np
import pandas as pd

### Pandas Objects: Series, DataFrame

#### Series - one-dimensional array of indexed data

In [None]:
# pd.Series(data, index=index) - constructing Series objects

In [None]:
# data = pd.Series()

In [2]:
# print(data.values)
# print (type(data.values))

In [3]:
# data.index

In [None]:
# data with index = ('a','b','c','d')

In [1]:
# explicit and implicit index
# print(data['a'])
# data[0]

In [None]:
# Series from a Python dictionary

In [9]:
# population_dict = {'California': 38332521,
#                  'Texas': 26448193,
#                  'New York': 19651127,
#                  'Florida': 19552860,
#                  'Illinois': 12882135}
# population = pd.Series(population_dict)
# population

In [4]:
# explicit and implicit index
# print(population['California'])
# population[0]

#### DataFrame

In [None]:
# Constructing df from a single Series object

In [5]:
# pd.DataFrame(population, columns=['population'])

In [None]:
# Constructing df from a dictionary of Series objects

In [10]:
# area_dict = {'California': 423967, 'New York': 141297,
#           'Florida': 170312, 'Illinois': 149995}
#area = pd.Series(area_dict)
#area

In [11]:
#cities= pd.DataFrame({'population': population,
#             'area': area})
#cities

In [None]:
# Constructing df from a two-dimensional NumPy array

In [12]:
# np.random.seed(0)
# pd.DataFrame(np.random.rand(3, 2),
#            columns=['c1', 'c2'],
#            index=['r1', 'r2', 'r3'])

#### Data indexing and selecting

In [None]:
# NumPy - indexing (e.g., arr[2, 1]), slicing (e.g., arr[:, 1:5]), 
# masking (e.g., arr[arr > 0]), fancy indexing (e.g., arr[0, [1, 5]]), 
# and combinations thereof (e.g., arr[:, [1, 5]])

#### Indexing in Series

In [None]:
# explicit (явный) index, implicit (неявный) integer index

In [None]:
# Confusion: if Series has an explicit integer index, 
# an indexing operation such as data[1] will use the explicit indices, 
# while a slicing operation like data[1:3] will use the implicit index.

In [13]:
# x = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
# x

In [14]:
# x[1]

In [15]:
# x[1:3]

In [None]:
# Indexers: loc, iloc
# loc attribute - indexing and slicing with the explicit index
# iloc attribute - indexing and slicing with the implicit index

# Principle of Python code: "explicit is better than implicit"

In [None]:
# x[1]

In [None]:
# x.loc[1:3]

In [None]:
# When slicing with an explicit index, the final index is included in the slice, 
# while when slicing with an implicit index, the final index is excluded from the slice.

#### Indexing and selecting in df

In [17]:
# cities

In [None]:
# NaN ?

In [None]:
# Population of Florida?


In [None]:
# Column 'population'?

In [None]:
# df['column'], df.column

In [None]:
# Slice: rows from the beginning to 'Illinois',
# columns in a changed order - 'population', 'area'
# 2 ways

In [19]:
# Add new column: 'density' = 'population' / 'area'


In [17]:
# Filter rows with 'density' > 100 and select columns 'density' and 'population'

In [20]:
# cities.T


#### Handling missing data

In [None]:
# NaN: Missing numerical data

In [None]:
# Operating on Null Values: .isnull(), .notnull(), .dropna(), .fillna()

In [None]:
# Detecting null values

In [21]:
# data = pd.Series([1, np.nan, 'hello', None])
# data

In [23]:
# data.isnull()

In [22]:
# data[data.notnull()]

In [None]:
# Dropping null values

In [24]:
# Series
# data.dropna()

In [25]:
# DataFrame
#df = pd.DataFrame([[1,      np.nan, 2, np.nan],
#                 [2,      3,      5, np.nan],
#                 [np.nan, 4,      6, np.nan]])
#df

In [None]:
# axis='rows' or omitted - drop rows with any null value 
# axis='columns' - drop columns with any null value 

In [26]:
# df.dropna()

In [27]:
# df.dropna(axis='columns')

In [28]:
# how='all' - drop only those rows/columns in which all values are null
# df.dropna(axis='columns', how='all')

In [None]:
# thresh - a minimum number of non-null values for the row/column to be kept
# df.dropna(axis='rows', thresh=3)

In [None]:
# Filling null values: .fillna()

In [29]:
# data

In [30]:
# data.fillna(0)

In [32]:
# forward-fill: .ffill() (older pandas: .fillna(method='ffill'))
# data.

In [31]:
# back-fill: .bfill() (older pandas: .fillna(method='bfill'))
# data.

In [33]:
# df

In [34]:
# axis=0 - fill down each column (along the rows); axis=1 - fill across each row (along the columns)
# df.fillna(method='ffill', axis= 0)

#### IMDB dataset

In [35]:
#titles = pd.read_csv('data/titles.csv')
#titles = titles[titles['year'] <= 2015]
#titles.head()

In [None]:
# Don't run !!!
# cast = pd.read_csv('data/cast_full.csv')
# cast = cast[cast['year'] <= 1990]
# cast.to_csv('data/cast.csv')

In [36]:
#cast = pd.read_csv('data/cast.csv')
#cast.head()

In [None]:
# .head(), .tail(), len()

In [37]:
# How many movies are listed in the titles dataframe?
# len(titles)

In [None]:
# What are the earliest three films listed in the titles dataframe?

In [38]:
# Sort_values for df: .sort_values('column')

In [None]:
# From a year to a decade

In [None]:
# How many people have played an "Ophelia"?
# .unique()

In [None]:
# ready for exercise 1

In [None]:
# Operations with string: .str. (.str.startswith(), .str.len())

In [45]:
# h = cast.head(10)
# h

In [39]:
# What title in df h starts with 'For'?

In [40]:
# Length of titles in df h?

In [None]:
# How many times does each item appear?
# .value_counts()

In [None]:
# How many films were released each year?

In [3]:
# t

In [13]:
%matplotlib inline

In [42]:
# t.plot();

In [None]:
# What is the problem with the plot?
# t is sorted by frequency, not by the index (year)

# .sort_index()

In [43]:
# c = cast
# c = c[c.character == 'Kermit the Frog']
# c.head()

In [44]:
#c.plot(x='year', y='n', kind ='scatter');

In [None]:
# ready for exercise 2

In [None]:
# .groupby()

In [None]:
# How many films have been released each decade in the history of cinema?
# .groupby(titles['year']//10*10).size()

In [46]:
# t = 


In [48]:
# plot the number of films: .plot(kind = 'bar')


In [49]:
# How many actor and actress roles were there each year?


In [None]:
# ready for exercise 3

In [34]:
# How to compare actors and actresses?

In [50]:
# .unstack()
# .unstack('type').fillna(0) 
# .stack()

In [51]:
# u.plot();

In [52]:
# u.plot(kind='area');

In [53]:
# u.stack().head()

In [54]:
# Plot the difference between the number of actor roles and actress roles


In [None]:
# To a question in exercise 4  "Plot... fraction of 'actor' roles...": 
# (c.actor/(c.actor+c.actress)).plot(ylim=[0,1])

In [None]:
# ready for exercise 4