#  Pandas 

In [1]:
# pandas is a powerful, open-source Python data analysis toolkit.
# It provides a fast and efficient DataFrame object for data manipulation.
# It reads and writes data between in-memory data structures and many formats:
# CSV, TSV, TXT, XML, JSON, zip, etc.

In [2]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

###  pandas version

In [3]:
# Version string of the pandas package imported above.
pd.__version__

'1.0.5'

### Series: a one-dimensional labeled (indexed) array

In [4]:
# Mixed-type input (ints, a float and a string) used to build a Series below.
list_s = [1, 2, -3, 6.2, "data values"]

In [5]:
# Build a Series from the mixed-type list; the default index is 0..n-1.
s1 = pd.Series(data=list_s)
print(s1)

0              1
1              2
2             -3
3            6.2
4    data values
dtype: object


In [6]:
# Confirm the Python type of the object built above.
type(s1)

pandas.core.series.Series

In [7]:
# Series from a homogeneous list of ints.
s2 = pd.Series(data=[1, 2, 3, 4])
print(s2)

0    1
1    2
2    3
3    4
dtype: int64


In [8]:
# Empty Series. The dtype is given explicitly: relying on the default for an
# empty Series is deprecated (newer pandas versions default to object instead
# of float64), so pinning it keeps the result stable across versions.
empty_s = pd.Series([], dtype="float64")
print(empty_s)

Series([], dtype: float64)


In [9]:
# Series with explicit string labels instead of the default integer index.
s3 = pd.Series(
    data=[1, 2, 3, 4],
    index=["A", "B", "C", "D"],
)
print(s3)

A    1
B    2
C    3
D    4
dtype: int64


In [10]:
# Same labelled data, with the values cast to float via the dtype argument.
s4 = pd.Series(
    data=[1, 2, 3, 4],
    index=["A", "B", "C", "D"],
    dtype=float,
)
print(s4)

A    1.0
B    2.0
C    3.0
D    4.0
dtype: float64


In [11]:
# Labelled float Series that also carries a name (shown in the printed footer).
s5 = pd.Series(
    data=[1, 2, 3, 4],
    index=["A", "B", "C", "D"],
    dtype=float,
    name="data values",
)
print(s5)

A    1.0
B    2.0
C    3.0
D    4.0
Name: data values, dtype: float64


In [12]:
# A scalar becomes a one-element Series with the default index.
scaler = pd.Series(data=0.5)
print(scaler)

0    0.5
dtype: float64


In [13]:
# A scalar combined with an index is repeated once per index label.
scaler = pd.Series(data=0.5, index=[1, 2, 3])
print(scaler)

1    0.5
2    0.5
3    0.5
dtype: float64


In [14]:
# Series from a dict: keys become the index labels, values become the data.
dict_s = pd.Series(data={"a": 1, "b": 2})
print(dict_s)

a    1
b    2
dtype: int64


### Series Operations

In [15]:
# Six-element integer Series used by the indexing and arithmetic cells that follow.
s6 = pd.Series(data=[1, 2, 3, 4, 5, 6])
print(s6)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [16]:
# Look up the element whose index label is 0.
s6.loc[0]

1

In [17]:
# Look up the element whose index label is 5.
s6.loc[5]

6

In [18]:
# Positional slice: elements at positions 2 and 3 (the stop position is excluded).
s6.iloc[2:4]

2    3
3    4
dtype: int64

In [19]:
# Largest value. Series.max() is the vectorised reduction; the builtin max()
# would iterate over the elements one by one in Python.
s6.max()

6

In [20]:
# Smallest value. Series.min() is the vectorised reduction; the builtin min()
# would iterate over the elements one by one in Python.
s6.min()

1

In [21]:
# Boolean-mask selection: keep only the elements greater than 3.
s6.loc[s6 > 3]

3    4
4    5
5    6
dtype: int64

In [22]:
# Second six-element Series, used as the right-hand operand of the addition below.
s7 = pd.Series(data=[1, 2, 3, 4, 5, 6])
print(s7)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [23]:
# Element-wise addition; operands are aligned on their index labels.
s6.add(s7)

0     2
1     4
2     6
3     8
4    10
5    12
dtype: int64

In [24]:
# Shorter Series, used to show addition between Series of different lengths.
s8 = pd.Series(data=[1, 2, 3])
print(s8)

0    1
1    2
2    3
dtype: int64


In [25]:
# Addition aligns on index labels; labels present in only one operand give NaN.
s6.add(s8)

0    2.0
1    4.0
2    6.0
3    NaN
4    NaN
5    NaN
dtype: float64

###  DataFrame 

### A pandas DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [26]:
# DataFrame with no rows and no columns.
df1 = pd.DataFrame(data=None)
print(df1)

Empty DataFrame
Columns: []
Index: []


In [27]:
# A flat list becomes a single column labelled 0, one row per element.
lst = ["a", "b", "c", "d"]
df2 = pd.DataFrame(data=lst)
print(df2)

   0
0  a
1  b
2  c
3  d


In [None]:
# Display the frame using the notebook's rich (table) rendering.
df2

In [None]:
# A list of lists: each inner list becomes one row of the frame.
ls = [
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
]
df3 = pd.DataFrame(data=ls)

In [None]:
# Display the frame built from the list of lists.
df3

In [None]:
# Dict of lists: each key becomes a column name, each list the column's values.
dict1 = {"Id": [11, 12, 13, 14]}
df4 = pd.DataFrame(data=dict1)
df4

In [None]:
# Two-column frame from a dict of lists; every list must have the same length.
dict1 = {
    "Id": [11, 12, 13, 14],
    "SN": [1, 2, 3, 4],
}
df5 = pd.DataFrame(data=dict1)
df5

In [None]:
# List of dicts: each dict is one row, its keys select the columns.
ls_dict = [
    {"a": 1, "b": 2},
    {"a": 1, "b": 2},
]
df6 = pd.DataFrame(data=ls_dict)
df6

In [None]:
# Rows with different key sets: a key missing from a row is filled with NaN.
ls_dict = [
    {"a": 1, "b": 2},
    {"a": 1, "b": 2, "c": 3},
]
df7 = pd.DataFrame(data=ls_dict)
df7

In [None]:
# Dict of Series: each Series becomes a column, aligned on the shared index.
dict_sr = {
    "Id": pd.Series([1, 2, 3]),
    "SN": pd.Series([111, 222, 333]),
}
df8 = pd.DataFrame(data=dict_sr)
df8

### read csv file

In [None]:
# Load the whole CSV into a DataFrame.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences such as "\V" or "\P".
df = pd.read_csv(r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv")

In [None]:
df.head()  # show the first 5 rows (the default for head())

In [None]:
import os 

In [None]:
print(os.getcwd())  # current working directory, against which relative file paths are resolved

In [None]:
# Confirm the Python type of the object returned by read_csv.
type(df)

In [None]:
# Column labels of the loaded frame.
df.columns

### nrows

In [None]:
# nrows=1: read only the first data row of the file.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    nrows=1,
)
df

In [None]:
# nrows=5: read only the first five data rows of the file.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    nrows=5,
)
df

###  usecols

In [None]:
# usecols=[0]: load only the first column (selected by position).
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    usecols=[0],
)
df

In [None]:
# usecols=[0, 1]: load only the first two columns (selected by position).
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    usecols=[0, 1],
)
df

### skiprows

In [None]:
# skiprows=1: skip the first line of the file; the next line is then parsed
# as the header row.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    skiprows=1,
)
df

In [None]:
# skiprows=2: skip the first two lines of the file; the third line is then
# parsed as the header row.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    skiprows=2,
)
df

In [None]:
# skiprows=[0]: a list names the specific 0-indexed line numbers to skip, here
# only the first line of the file.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    skiprows=[0],
)
df

### index_col

In [None]:
# index_col="id": use the "id" column as the row index instead of the default 0..n-1.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    index_col="id",
)
df

### header 

In [None]:
# header=1: take the column names from line 1 (0-indexed, i.e. the second line);
# data starts on the line after it and the preceding line is dropped.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    header=1,
)
df

In [None]:
# header=None: treat no line as a header; columns are numbered 0..n-1 and the
# first line of the file is read as ordinary data.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    header=None,
)
df

### prefix 

In [None]:
# Prefix the auto-generated column numbers: "Columns0", "Columns1", ...
# read_csv's `prefix` argument was deprecated in pandas 1.4 and removed in 2.0,
# so the same labels are produced with header=None (integer column labels)
# followed by DataFrame.add_prefix, which works on old and new versions alike.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    header=None,
).add_prefix("Columns")
df

### names

In [None]:
# names=[...]: supply the column labels explicitly (here the integers 1..18).
# Because no `header` is given, the file's own first line is kept as a data row.
# Raw string: the backslashes of the Windows path stay literal instead of being
# parsed as (invalid) escape sequences.
df = pd.read_csv(
    r"F:\Vaibhav\Python\.ipynb_checkpoints\Data science\Analysis\Kaggle Dataset\matches.csv",
    names=list(range(1, 19)),
)
df

### head

In [None]:
# Relative path: the file is looked up in the current working directory.
data = pd.read_csv('matches.csv')
data.head()       # shows the first 5 rows by default

In [None]:
# Show the first six rows.
data.head(n=6)

### tail

In [None]:
data.tail()  # shows the last 5 rows by default

In [None]:
# Show the last six rows.
data.tail(n=6)

### dtype

In [None]:
# dtype maps column names to the type they should be parsed as; here "id" is read as float.
data = pd.read_csv(
    "matches.csv",
    dtype={"id": "float"},
)
data

In [None]:
# Several columns can be typed at once: both "id" and "win_by_runs" are read as float.
data = pd.read_csv(
    "matches.csv",
    dtype={
        "id": "float",
        "win_by_runs": "float",
    },
)
data

###  true_values , false_values

In [None]:
# true_values / false_values list the cell strings to be parsed as True / False.
data = pd.read_csv(
    "matches.csv",
    true_values=["Yes"],
    false_values=["No"],
)

In [None]:
# Display the frame loaded with the true_values / false_values options.
data