# Python Tutorial - Numpy, Pandas & Matplotlib

## 1. NumPy (a.k.a. NUMerical PYthon)

### - BASICS

In [None]:
# Import numpy as np (so instead of writing numpy all the time, we write np)
import numpy as np                                   

# Basic numpy object: a 1D array. All elements share ONE dtype: mixed inputs are upcast to a common type
# (e.g. numbers and strings together ALL BECOME STRINGS by coercion).
# BE CAREFUL: arithmetic on arrays is ALWAYS elementwise (with lists, + concatenates and * repeats instead).
# It is also elementwise between two arrays!
foo = np.array([1, 2, 3, 4])

# A 2D array
foo2 = np.array([[1, 2, 3, 4],
                 [5, 6, 7, 8]])

# Example positions used by the indexing commands below (any valid integers work)
i, j = 1, 3

# The i'th element of the np.array. The first one is the 0th
foo[i]

# The i'th element from the end of the np.array. The last element is foo[-1] (careful: foo[-0] is foo[0])
foo[-i]

# Elements of the np.array from i up to j-1.
foo[i:j]

# When a bound is missing, it means from the beginning (or up to the end). If both are missing we get the whole np.array
foo[:j], foo[i:], foo[:]

# Boolean mask: keeps only the elements of the ARRAY that satisfy the condition
foo[foo > 2]

# Gives back the positions of foo whose value is bigger than 2 (a tuple with one index array per dimension)
np.where(foo > 2)

# For an nD array, i picks the row (i.e. the inner list) and j picks the column (i.e. the element of that list).
# Needs one index per dimension, so it is shown on the 2D array foo2
foo2[i, j]

# An attribute of the array: the length along each dimension. (4,) for foo, (2, 4) = (rows, columns) for foo2
foo.shape

# Shows the type of entries inside the array
foo.dtype

# Shows how many items are inside an array (all dimensions together)
foo.size

# Shows the number of dimensions of an array
foo.ndim

### - VECTORS

In [6]:
# Column vector with 20 random entries, i.e. an array of shape (20, 1)
a = np.random.rand(20, 1)

# numpy functions act on every element: here the exponential of each entry of a
exp_a = np.exp(a)

# Dot products: (1, 20) x (20, 1) gives a 1x1 result, (20, 1) x (1, 20) gives a 20x20 matrix
value = a.T.dot(a)
matrix = a.dot(a.T)

## 2. Pandas (built on top of NumPy; its plotting uses Matplotlib)

### - SERIES

In [None]:
# Import pandas as pd (so instead of writing pandas all the time, we write pd)                                 
import pandas as pd

In [None]:
# A Series built from a list. No keys are given, so pandas labels the elements 0, 1, 2, ...
foo = pd.Series(data=[34, 'tasos'])

# A Series built from a dictionary: here the keys become the index labels.
foo = pd.Series(data=dict(age=34, name='tasos'))

# STANDARD PYTHON ELEMENT ACCESS FOR SERIES: foo['age'] by label, foo.iloc[0] by position, etc...
# STANDARD PYTHON ACTIONS FOR SERIES: add lists, add to elements, etc...

### - DATAFRAME (Series combined together)

In [None]:
# DataFrame from a list of dictionaries: one row per dictionary, one column per dictionary key
df = pd.DataFrame(data=[dict(age=34, name='tasos'),
                        dict(age=43, name='georgia')])

# Load data as a DataFrame ('path' is a placeholder for the location of the file)
df = pd.read_table('path',
                   sep = ',',                   # Defines the separator. Here it is a comma. Default of read_table: tab
                   header = None,               # Row number that holds the column names. None = the file has no header row. Default: 'infer'
                   names = ['col1','col2'],     # Gives names to the columns from the list. To REPLACE an existing header row use header = 0
                   index_col = 'col1',          # Specify which column is the index (the keyword is index_col). If omitted, pandas creates a 0,1,2,... index
                   skiprows = 12,               # Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file
                   skipfooter = 12,             # Number of lines at bottom of file to skip (unsupported with engine='c')
                   engine = 'python',           # The python engine is the one that supports skipfooter
                   usecols = [0,4])             # Choose specific columns from a file. Works also with the names of the columns

# To read only the first 12 rows pass nrows = 12. It is left out of the call above
# because pandas does not accept nrows together with skipfooter.
                      
# Writes a dataframe to a csv file named 'name' in the working directory.
# pd.DataFrame() is an empty frame used as a stand-in: call .to_csv on your own dataframe in the same way
pd.DataFrame().to_csv(path_or_buf='name')


# Selects the column named 'col1' from DataFrame df. This object is a pandas Series.
# Must be used if there is a space in the column name
# Must be used if I want to define a new column
# If the entries of the series are strings, we can do all the things we can do with strings.
# FIRST USE .str AND THEN THE METHOD OF A STRING, e.g. df['col1'].str.upper()
df['col1']

# Selects the column named 'col1' from DataFrame df. This object is a pandas Series.
# If there is a space in the column name, it cannot be used
# It cannot be used for creating a new column
# If the entries of the series are strings, the same .str methods are available
df.col1

# Select rows that satisfy the condition: df.col1 == 'hey'. Works for any condition that we can think of.
# For conditions: ==, !=, <, >, <=, >=   (a single = is an assignment and is a syntax error here)
# For multiple conditions: & (and), | (or). Wrap each condition in parentheses
df[df.col1 == 'hey']

# For too many or's we can use the isin trick
df[df.col1.isin(['hey', 'hey2'])]

# Picks specific values using labels: index label for rows, column name for columns
# Use : or lists for many. We can also use conditions for rows
df.loc['Greece', 'col1']

# Same thing, picks specific values using integer positions. Counts from the beginning (0-based)
df.iloc[12, 1]

# Same thing, mixing a row position with a column name. The old .ix indexer was removed from pandas,
# so turn the position into a label first and then use loc
df.loc[df.index[12], 'col1']

### - BASIC METHODS & ATTRIBUTES OF DATAFRAMES

In [None]:
# NOTE: every statement below is an independent example. Several of them rename or drop
# columns of df, so this cell is a reference list and is not meant to be run top to bottom.

# A tuple: (Rows, Columns)
df.shape

# An Index object with the names of the columns of df. Since they are strings, string methods are available through df.columns.str
df.columns

# Information on the index of the dataframe
df.index

# Renames the columns from names in list. MUST SPECIFY ALL OF THEM
df.columns = ['a','b']

# Shows the type of the values of each column in df
df.dtypes

# Use column col1 as an index. BE CAREFUL: it does not count as a column anymore. Select by its values with loc
# Returns a NEW dataframe: reassign it (df = df.set_index('col1')) or pass inplace=True
df.set_index('col1')

# Puts the index back as a column, and then creates a default 0,1,2,... index. Also returns a NEW dataframe
df.reset_index()

# General info about df. memory_usage='deep' gives a very accurate description of memory usage
df.info(memory_usage='deep')

# Show the 10 first rows of df. Default value = 5
df.head(10)

# Show the 10 last rows of df. Default value = 5
df.tail(10)

# A dataframe containing all basic statistics such as max, min, mean etc... per column
df.describe()

# Changes the whole df to the type we specify (the type argument is required; str is just an example).
# Pass a dictionary {column: type} to convert only some columns. Works also with 'category'
df.astype(str)

# Shows the unique values of column col1. Returns a numpy array
df.col1.unique()

# Shows the number of unique values of column col1. Returns an integer
df.col1.nunique()

# describe() works also with columns of a dataframe i.e. with pandas series
df.col1.describe()

# Shows how many times unique values of col1 appear.
# normalize = True, gives fractions of the total instead of pure counts
df.col1.value_counts(normalize = True)

# Groups the data w.r.t. col1. Now col1 turns into the index of the result. After the command we can choose max, min, mean etc.
df.groupby('col1')

# Computes several statistics at once for each group.
df.groupby('col1').agg(['min', 'max', 'mean'])

# Deletes from df the column labelled 'col1'. axis=1 means columns. inplace=True means do it in the original dataframe
df.drop('col1', axis=1, inplace=True)

# Deletes from df the rows with index labels 1 and 2. axis=0 means rows. inplace=True means do it in the original dataframe
df.drop([1,2], axis=0, inplace=True)

# Renames columns of dataframe. Dictionary: Key = Old Name, Value = New Name.
# NO NEED TO SPECIFY ALL OF THEM. Just use the columns you want to change.
# The columns= keyword is needed: a bare dictionary would rename index labels instead. Returns a NEW dataframe
df.rename(columns={'old': 'new'})

# Returns the df sorted by the values of column 'col1'. USE A LIST OF COLUMNS FOR MULTIPLE SORTING
df.sort_values('col1')

# Sorts the values of column 'col1'. Returns a pandas series. By default ascending = True. DOES NOT CHANGE THE df.col1
df.col1.sort_values(ascending = False)

# Returns col1 converted to float. Works with all types. object usually means string. DOES NOT CHANGE THE df.col1
df.col1.astype(float)

# iterrows(): A way to iterate through the dataframe row by row, as (index, row) pairs.
for index, row in df.iterrows():
    print(index, row.col1, row.col2)

### - TIMESERIES

In [None]:
# Timestamp: one single point in time
ts = pd.Timestamp('2016 Jul 1 10:00:00')

# Ranges of dates. Date strings may be written as: '2016 Jul 1', '7/1/2016', 'July 1, 2016', '2016-07-01', '2016/07/01'
# (a slash date is read month first, so '1/7/2016' is accepted too but means 7 January)
# Either give a start and how many periods we want...
dt = pd.date_range(start='2016 Jul 1 10:00:00', periods=10, freq='D')
# ...or give a start and an end. Both ways produce the same 10 daily timestamps here
dt = pd.date_range(start='2016 Jul 1 10:00:00', end='2016 Jul 10 10:00:00', freq='D')

## 3. Matplotlib

In [None]:
# Import matplotlib.pyplot as plt (so instead of writing matplotlib.pyplot all the time, we write plt)                                 
import matplotlib.pyplot as plt 

# NOTE: x, y, x2, y2, x3, y3 and y1 are placeholders for your own data sequences.
# The pie chart example expects three values, one per slice.

# Creates the figure and arranges its details. Call it BEFORE the plotting commands
plt.figure(figsize = (1,1))                 # Sets the size of the figure: (width, height) in inches

# Basic plot of x and y.
plt.plot(x, y,
         label = 'First Plot',              # The label that characterizes this plot
         color = 'red',                     # The color of the line
         linewidth = 1.0,                   # The width of the line, measured in points
         linestyle = '-')                   # The style of the line, i.e. what symbol will be used in order to create the graph

# Another plot (on the same picture). Use another label for distinction
plt.plot(x2, y2, label = 'Second Plot')

# Bar Plot
plt.bar(x3, y3, label = 'Bar Plot')

# Histogram Plot of x (Measures the frequency of entries in x)
plt.hist(x,
         bins = 30)                          # Number of bins on the plot. We can also give a list of specific bin edges

# Scatter Plot
plt.scatter(x3, y3, label = 'Scatter Plot')

# Stack Plot: One x-axis, a lot of data stacked on the y-axis. Give one color per data series
plt.stackplot(x, y1, y2, y3, colors = ['red', 'green', 'blue'])

# Pie Chart: One set of values, shows each value as a share of the whole
plt.pie(x,
        labels = ['a', 'b', 'c'],             # One label per slice (must have as many entries as x)
        colors = ['red', 'green', 'blue'],    # The colors the slices cycle through
        startangle = 90,                      # Starting angle (here 90 degrees)
        shadow = True,                        # Adds a shadow
        explode = (0, 0.1, 0),                # Explodes slices of pie. One entry per slice (here only the second one by 0.1)
        autopct = '%1.1f%%')                  # Adds percentages on the slices of the pie

# Title of plot
plt.title('Graph Title\nSecond Line')

# Name of x-axis
plt.xlabel('x-axis')

# Name of y-axis
plt.ylabel('y-axis')

# Uses the labels of plots and creates a legend.
plt.legend(loc = 'upper left',                  # Sets the position of the legend.
           frameon = False)                     # Sets if there is a frame covering legend or not

# The limits of x-axis: (left, right)
plt.xlim(-1.0, 1.0)

# Choose specific points to show on x-axis, not everything (here 5 evenly spaced points)
plt.xticks(np.linspace(-1.0, 1.0, 5, endpoint = True))

# Same command with two lists: first list picks points, second list gives them specific names
plt.xticks([-1.0, 1.0], ['hey', 'hey'])

# The limits of y-axis: (bottom, top)
plt.ylim(-1.0, 1.0)

# Choose specific points to show on y-axis, not everything (here 5 evenly spaced points)
plt.yticks(np.linspace(-1.0, 1.0, 5, endpoint = True))

# Same command with two lists: first list picks points, second list gives them specific names
plt.yticks([-1.0, 1.0], ['hey', 'hey'])

# Start manipulating the axes
ax = plt.gca()

# Manipulating spines (There are 4: top, bottom, right, left)
ax.spines['top']

# Setting the color. By 'None' we just make it vanish
ax.spines['top'].set_color('None')

# In which spine will the ticks appear on x-axis
ax.xaxis.set_ticks_position('bottom')

# Specifies the position of spines. The tuple says: at 0 w.r.t. the data coordinates
ax.spines['top'].set_position(('data',0))

# Shows the plot that we created. Always at the end of the code
plt.show()