# 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Creating a Series by passing a list of values, letting pandas create a default RangeIndex.

In [2]:
# The np.nan entry is a missing value; its presence makes the whole Series float64.
# No index is passed, so pandas labels the entries 0..5 itself.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## Creating a DatetimeIndex with date_range(), to be used as the row index of a DataFrame:

In [3]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start="2013-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

## Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [4]:
# Draw the 6x4 table of standard-normal values from a seeded generator so the
# frame (and everything derived from it below) is identical on every
# Restart Kernel -> Run All. The unseeded np.random.randn call produced
# different numbers on each run.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(
    rng.standard_normal((6, 4)),  # 6 rows (one per date) x 4 columns
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


## Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [5]:
# Each keyword becomes a column label. The scalar values (A, B, F) are repeated
# for every row; the row labels 0..3 come from the index of the Series in C.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## The columns of the resulting DataFrame have different dtypes:

In [6]:
# dtypes lists the dtype of every column; each of the six columns of df2 has a different one.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [None]:
# If you're using IPython, tab completion for column names (as well as public attributes) is automatically enabled.
# For example, typing `df2.` and pressing <TAB> offers completions such as df2.A, df2.abs, df2.add and df2.dtypes.

In [9]:
# Per-column dtypes of df2. The original cell also contained a bare `or` line
# followed by `df2.info()`, which is a SyntaxError in a code cell; the
# alternative, df2.info(), is run in the next cell instead.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [10]:
# info() prints a summary: the index range, each column's non-null count and dtype, and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype        
---  ------  --------------  -----        
 0   A       4 non-null      float64      
 1   B       4 non-null      datetime64[s]
 2   C       4 non-null      float32      
 3   D       4 non-null      int32        
 4   E       4 non-null      category     
 5   F       4 non-null      object       
dtypes: category(1), datetime64[s](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


## Viewing data

In [11]:
# head() shows the first rows (5 by default per the pandas docs); df2 has only 4 rows, so all of them appear.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
# tail() shows the last rows (5 by default per the pandas docs); with only 4 rows the result equals head().
df2.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [13]:
# The row labels of df: the six-day DatetimeIndex it was built with.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# The column labels of df, returned as an Index object.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# to_numpy() returns the underlying values as a NumPy array; the index and column labels are not included.
df.to_numpy()

array([[ 1.41297319, -0.85308107,  1.82516512,  0.17589814],
       [ 0.10147294,  0.58029452,  0.68957912, -1.19589676],
       [-0.97226994,  0.37964579,  1.54668491, -1.18968638],
       [-3.06386654,  1.2545601 , -0.01172944,  0.0992005 ],
       [-0.24881208, -2.93049486, -1.98816765, -1.67396979],
       [-0.1638528 , -0.51208132,  0.7914764 ,  0.9528862 ]])

In [16]:
# describe() gives per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.489059,-0.346859,0.475501,-0.471928
std,1.482772,1.477207,1.372717,1.025688
min,-3.063867,-2.930495,-1.988168,-1.67397
25%,-0.791405,-0.767831,0.163598,-1.194344
50%,-0.206332,-0.066218,0.740528,-0.545243
75%,0.035142,0.530132,1.357883,0.156724
max,1.412973,1.25456,1.825165,0.952886


In [17]:
# Display df again as a reference for comparison with its transpose in the next cell.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


In [18]:
# .T transposes the frame: the column labels A-D become the rows and the dates become the columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.412973,0.101473,-0.97227,-3.063867,-0.248812,-0.163853
B,-0.853081,0.580295,0.379646,1.25456,-2.930495,-0.512081
C,1.825165,0.689579,1.546685,-0.011729,-1.988168,0.791476
D,0.175898,-1.195897,-1.189686,0.0992,-1.67397,0.952886


In [19]:
# Display df again as a reference for the sorting examples that follow.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


In [22]:
# Sort along the column axis by label, in descending order (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.175898,1.825165,-0.853081,1.412973
2013-01-02,-1.195897,0.689579,0.580295,0.101473
2013-01-03,-1.189686,1.546685,0.379646,-0.97227
2013-01-04,0.0992,-0.011729,1.25456,-3.063867
2013-01-05,-1.67397,-1.988168,-2.930495,-0.248812
2013-01-06,0.952886,0.791476,-0.512081,-0.163853


In [25]:
# Sorting by several columns is supported: df.sort_values(by=["A", "B"]) orders the
# rows by A and uses B only to break ties in A. Every value in A is distinct here,
# so B would never be consulted and the result is the same as sorting by A alone.
df.sort_values(by= ["A"])

Unnamed: 0,A,B,C,D
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-01,1.412973,-0.853081,1.825165,0.175898


## Getting data: selecting columns and rows with []

In [26]:
df["A"] # a single column label returns that column as a Series


2013-01-01    1.412973
2013-01-02    0.101473
2013-01-03   -0.972270
2013-01-04   -3.063867
2013-01-05   -0.248812
2013-01-06   -0.163853
Freq: D, Name: A, dtype: float64

In [27]:
# To select two or more columns, pass a list of column labels inside the indexing
# brackets (hence the double brackets); the result is a DataFrame, not a Series.
df[["A","B"]]

Unnamed: 0,A,B
2013-01-01,1.412973,-0.853081
2013-01-02,0.101473,0.580295
2013-01-03,-0.97227,0.379646
2013-01-04,-3.063867,1.25456
2013-01-05,-0.248812,-2.930495
2013-01-06,-0.163853,-0.512081


In [28]:
# Display df again as a reference for the row-slicing and loc/iloc examples that follow.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


In [29]:
# An integer slice inside [] selects rows by position; the end (3) is excluded, so the first three rows are returned.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686


In [30]:
df.loc[:,["A","B"]] # loc selects by label: all rows (:) and the columns named "A" and "B"

Unnamed: 0,A,B
2013-01-01,1.412973,-0.853081
2013-01-02,0.101473,0.580295
2013-01-03,-0.97227,0.379646
2013-01-04,-3.063867,1.25456
2013-01-05,-0.248812,-2.930495
2013-01-06,-0.163853,-0.512081


In [33]:
df.iloc[:,0:4] # iloc selects by integer position: all rows, columns 0-3 (the end of the slice is excluded)

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


### Selection by position

In [34]:
# A single integer with iloc returns the row at that position (3, i.e. the fourth row) as a Series.
df.iloc[3]

A   -3.063867
B    1.254560
C   -0.011729
D    0.099200
Name: 2013-01-04 00:00:00, dtype: float64

In [36]:
# NOTE(review): imports conventionally live in the first cell of the notebook;
# this one is kept here so the cell still runs on its own.
import seaborn as sns
# Load seaborn's example "titanic" table as a DataFrame. Later cells refer to it
# by this name. NOTE(review): load_dataset may need network access on first use; confirm.
kashti = sns.load_dataset('titanic')
# Displaying the whole frame; the notebook truncates the rendering to the first and last rows.
kashti

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [37]:
# Draw 100 random rows. random_state fixes the draw so the same rows are
# selected on every Restart Kernel -> Run All; without it the sample changed on each run.
kashti.sample(n=100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
624,0,3,male,21.00,0,0,16.1000,S,Third,man,True,,Southampton,no,True
316,1,2,female,24.00,1,0,26.0000,S,Second,woman,False,,Southampton,yes,False
786,1,3,female,18.00,0,0,7.4958,S,Third,woman,False,,Southampton,yes,True
609,1,1,female,40.00,0,0,153.4625,S,First,woman,False,C,Southampton,yes,True
424,0,3,male,18.00,1,1,20.2125,S,Third,man,True,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,1,1,male,0.92,1,2,151.5500,S,First,child,False,C,Southampton,yes,False
722,0,2,male,34.00,0,0,13.0000,S,Second,man,True,,Southampton,no,True
547,1,2,male,,0,0,13.8625,C,Second,man,True,,Cherbourg,yes,True
583,0,1,male,36.00,0,0,40.1250,C,First,man,True,A,Cherbourg,no,True


In [38]:
# Boolean-mask selection: keep the rows whose A value is below 5.
df.loc[df["A"].lt(5)]

Unnamed: 0,A,B,C,D
2013-01-01,1.412973,-0.853081,1.825165,0.175898
2013-01-02,0.101473,0.580295,0.689579,-1.195897
2013-01-03,-0.97227,0.379646,1.546685,-1.189686
2013-01-04,-3.063867,1.25456,-0.011729,0.0992
2013-01-05,-0.248812,-2.930495,-1.988168,-1.67397
2013-01-06,-0.163853,-0.512081,0.791476,0.952886


In [42]:
# Boolean-mask selection: keep the passengers whose fare is below 5.
kashti.loc[kashti["fare"].lt(5)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
