# Pandas
### Open source data analysis library written in python.
### It leverages the power and speed of numpy to make data analysis and preprocessing easy for data scientists.
### Provides rich and robust data operations.

# Pandas Data Structure
### Series: 1D array with indexes, it stores a single column or row of data in a Dataframe
### DataFrame: Tabular Spreadsheet like structures representing rows each of which contains one or multiple columns
### 1D array(labeled) capable of holding any type of data = Series
### 2D data(labeled) structure with columns of potentially different types of data = DataFrame

In [1]:
import numpy as np 
import pandas as pd

In [2]:
dictX = {
    "name" : ['a','b','c','d'],
    "marks" : [12,21,13,31],
    "city" : ['W','X','Y','Z']
}

In [3]:
df = pd.DataFrame(dictX)

In [4]:
df

Unnamed: 0,name,marks,city
0,a,12,W
1,b,21,X
2,c,13,Y
3,d,31,Z


In [5]:
df.to_csv('dictX.csv') # Write object to a comma-separated values (csv) file.

In [6]:
df.to_csv('dictXif.csv', index=False) # Without Index

In [7]:
df.head() # Return the first n rows.

Unnamed: 0,name,marks,city
0,a,12,W
1,b,21,X
2,c,13,Y
3,d,31,Z


In [8]:
df.head(2)

Unnamed: 0,name,marks,city
0,a,12,W
1,b,21,X


In [9]:
df.tail(2) # Return the last n rows.

Unnamed: 0,name,marks,city
2,c,13,Y
3,d,31,Z


In [10]:
df.describe() # Generate descriptive statistics.

Unnamed: 0,marks
count,4.0
mean,19.25
std,8.80814
min,12.0
25%,12.75
50%,17.0
75%,23.5
max,31.0


In [11]:
dictXn = pd.read_csv('dictXn.csv') # Read a comma-separated values (csv) file into DataFrame.

In [12]:
dictXn

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1.1,sno,name,marks,city
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,a,49,W
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,b,21,X
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,c,13,Y
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,d,31,Z


In [13]:
dictXn['name']

0    a
1    b
2    c
3    d
Name: name, dtype: object

In [14]:
dictXn['name'][0]

'a'

In [15]:
dictXn['marks'][0] = 49

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
dictXn

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1.1,sno,name,marks,city
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,a,49,W
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,b,21,X
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,c,13,Y
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,d,31,Z


In [17]:
dictXn.to_csv('dictXn.csv')

In [18]:
dictXn.index = ['1st','2nd','3rd','4th'] # The index (row labels) of the DataFrame.

In [19]:
dictXn

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1.1.1.1.1.1.1,sno,name,marks,city
1st,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,a,49,W
2nd,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,b,21,X
3rd,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,c,13,Y
4th,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,d,31,Z


In [20]:
ser = pd.Series(np.random.rand)

In [21]:
ser

0    <built-in method rand of numpy.random.mtrand.R...
dtype: object

In [22]:
ser = pd.Series(np.random.rand(34))

In [23]:
ser

0     0.393870
1     0.563165
2     0.166121
3     0.231289
4     0.950619
5     0.163057
6     0.057049
7     0.348671
8     0.063167
9     0.157772
10    0.447247
11    0.288974
12    0.030435
13    0.374012
14    0.781363
15    0.839109
16    0.934318
17    0.192613
18    0.768015
19    0.942916
20    0.413592
21    0.683006
22    0.084870
23    0.855643
24    0.079555
25    0.054031
26    0.158938
27    0.615507
28    0.649929
29    0.423414
30    0.237623
31    0.009106
32    0.381770
33    0.657211
dtype: float64

In [24]:
newdf = pd.DataFrame(np.random.rand(334,5), index=np.arange(334))

In [25]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.443810,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [26]:
newdf.describe()

Unnamed: 0,0,1,2,3,4
count,334.0,334.0,334.0,334.0,334.0
mean,0.485515,0.511346,0.475117,0.509235,0.46759
std,0.283447,0.28315,0.300924,0.289737,0.291759
min,0.001175,0.004553,0.003203,0.007219,0.000325
25%,0.240324,0.263894,0.201466,0.256895,0.212382
50%,0.496807,0.500316,0.442113,0.515628,0.424466
75%,0.71748,0.754071,0.747526,0.765107,0.727013
max,0.991591,0.999527,0.997067,0.999691,0.996278


In [27]:
newdf.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [28]:
newdf[0][0] = 'ash'

In [29]:
newdf.dtypes

0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [30]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,ash,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501


In [31]:
newdf.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            324, 325, 326, 327, 328, 329, 330, 331, 332, 333],
           dtype='int64', length=334)

In [32]:
newdf.columns

RangeIndex(start=0, stop=5, step=1)

In [33]:
newdf.to_numpy() # Convert the DataFrame to a NumPy array.

array([['ash', 0.656686563652733, 0.6109467196726239, 0.9330882773601392,
        0.3217744863549894],
       [0.9671381005827749, 0.3812821042538239, 0.09485613325004805,
        0.22311729407445025, 0.05018450966610921],
       [0.4914224391044294, 0.11845136399657663, 0.1607182431948182,
        0.9382680256467814, 0.552132812300518],
       ...,
       [0.5511652650649579, 0.8030935613822892, 0.1170470498545968,
        0.9864443825371684, 0.583409241231488],
       [0.7971068440766489, 0.24999875182558684, 0.8543750489111331,
        0.5132655272089959, 0.7679182538022267],
       [0.24752947890083632, 0.6594112651687941, 0.8502064681191058,
        0.6589736208380923, 0.9891099184741948]], dtype=object)

In [34]:
newdf[0][0] = 0.9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [35]:
newdf.to_numpy()

array([[0.9, 0.656686563652733, 0.6109467196726239, 0.9330882773601392,
        0.3217744863549894],
       [0.9671381005827749, 0.3812821042538239, 0.09485613325004805,
        0.22311729407445025, 0.05018450966610921],
       [0.4914224391044294, 0.11845136399657663, 0.1607182431948182,
        0.9382680256467814, 0.552132812300518],
       ...,
       [0.5511652650649579, 0.8030935613822892, 0.1170470498545968,
        0.9864443825371684, 0.583409241231488],
       [0.7971068440766489, 0.24999875182558684, 0.8543750489111331,
        0.5132655272089959, 0.7679182538022267],
       [0.24752947890083632, 0.6594112651687941, 0.8502064681191058,
        0.6589736208380923, 0.9891099184741948]], dtype=object)

In [36]:
newdf.T # Transpose

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
0,0.9,0.967138,0.491422,0.611986,0.095982,0.606373,0.30179,0.886338,0.511665,0.144796,...,0.020965,0.247791,0.596959,0.123085,0.682206,0.739568,0.937659,0.551165,0.797107,0.247529
1,0.656687,0.381282,0.118451,0.082126,0.643074,0.724032,0.391756,0.370041,0.984893,0.650148,...,0.747255,0.62123,0.628755,0.421213,0.939226,0.214652,0.173832,0.803094,0.249999,0.659411
2,0.610947,0.094856,0.160718,0.432831,0.158596,0.151007,0.892546,0.564142,0.558747,0.191856,...,0.102575,0.020547,0.873624,0.840569,0.823451,0.164648,0.734524,0.117047,0.854375,0.850206
3,0.933088,0.223117,0.938268,0.104175,0.072365,0.081908,0.232492,0.72587,0.083075,0.287759,...,0.641626,0.388302,0.369355,0.562521,0.517991,0.298359,0.859751,0.986444,0.513266,0.658974
4,0.321774,0.050185,0.552133,0.522716,0.380501,0.099598,0.118668,0.502882,0.138249,0.113505,...,0.162104,0.423738,0.650963,0.738801,0.476762,0.000325,0.576515,0.583409,0.767918,0.98911


In [37]:
newdf.sort_index(axis=0, ascending=False) # axis=0 (Row) by default true 1 for column

Unnamed: 0,0,1,2,3,4
333,0.247529,0.659411,0.850206,0.658974,0.989110
332,0.797107,0.249999,0.854375,0.513266,0.767918
331,0.551165,0.803094,0.117047,0.986444,0.583409
330,0.937659,0.173832,0.734524,0.859751,0.576515
329,0.739568,0.214652,0.164648,0.298359,0.000325
...,...,...,...,...,...
4,0.095982,0.643074,0.158596,0.072365,0.380501
3,0.611986,0.082126,0.432831,0.104175,0.522716
2,0.491422,0.118451,0.160718,0.938268,0.552133
1,0.967138,0.381282,0.094856,0.223117,0.050185


In [38]:
type(newdf[0])

pandas.core.series.Series

In [39]:
newdf2 = newdf # Only view but are same Changes are seen

In [40]:
newdf2[0][0] = 9783

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [41]:
newdf

Unnamed: 0,0,1,2,3,4
0,9783,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [42]:
newdf3 = newdf.copy() # Makes copy

In [43]:
newdf3[0][0] = 9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [44]:
newdf3

Unnamed: 0,0,1,2,3,4
0,9,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [45]:
newdf

Unnamed: 0,0,1,2,3,4
0,9783,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [46]:
newdf.loc[0,0] =654 # chnages value no error

In [47]:
newdf

Unnamed: 0,0,1,2,3,4
0,654,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [48]:
newdf.columns = list("abcde")

In [49]:
newdf

Unnamed: 0,a,b,c,d,e
0,654,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [50]:
newdf.loc[0,0] =654

In [51]:
newdf.head(2)

Unnamed: 0,a,b,c,d,e,0
0,654.0,0.656687,0.610947,0.933088,0.321774,654.0
1,0.967138,0.381282,0.094856,0.223117,0.050185,


In [52]:
newdf.loc[0,'a'] =64

In [53]:
newdf.head(2)

Unnamed: 0,a,b,c,d,e,0
0,64.0,0.656687,0.610947,0.933088,0.321774,654.0
1,0.967138,0.381282,0.094856,0.223117,0.050185,


In [54]:
newdf = newdf.drop(0,axis=1) 

In [55]:
newdf

Unnamed: 0,a,b,c,d,e
0,64,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [56]:
newdf.loc[[1,2], ['c', 'd']] # Original not changed

Unnamed: 0,c,d
1,0.094856,0.223117
2,0.160718,0.938268


In [57]:
newdf.loc[:, ['c', 'd']] # All rows

Unnamed: 0,c,d
0,0.610947,0.933088
1,0.094856,0.223117
2,0.160718,0.938268
3,0.432831,0.104175
4,0.158596,0.072365
...,...,...
329,0.164648,0.298359
330,0.734524,0.859751
331,0.117047,0.986444
332,0.854375,0.513266


In [58]:
newdf.loc[[1,2], :] # All columns

Unnamed: 0,a,b,c,d,e
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133


In [59]:
newdf.loc[(newdf['a']<0.3)]

Unnamed: 0,a,b,c,d,e
4,0.095982,0.643074,0.158596,0.072365,0.380501
9,0.144796,0.650148,0.191856,0.287759,0.113505
20,0.292013,0.011845,0.160110,0.478455,0.220130
22,0.240151,0.447385,0.627967,0.218365,0.798461
23,0.096557,0.121303,0.115051,0.263068,0.947936
...,...,...,...,...,...
319,0.037097,0.699660,0.848091,0.381312,0.403218
324,0.020965,0.747255,0.102575,0.641626,0.162104
325,0.247791,0.621230,0.020547,0.388302,0.423738
327,0.123085,0.421213,0.840569,0.562521,0.738801


In [60]:
newdf.loc[(newdf['a']<0.3) & (newdf['c']>0.1)]

Unnamed: 0,a,b,c,d,e
4,0.095982,0.643074,0.158596,0.072365,0.380501
9,0.144796,0.650148,0.191856,0.287759,0.113505
20,0.292013,0.011845,0.160110,0.478455,0.220130
22,0.240151,0.447385,0.627967,0.218365,0.798461
23,0.096557,0.121303,0.115051,0.263068,0.947936
...,...,...,...,...,...
318,0.199337,0.102124,0.208636,0.037054,0.980866
319,0.037097,0.699660,0.848091,0.381312,0.403218
324,0.020965,0.747255,0.102575,0.641626,0.162104
327,0.123085,0.421213,0.840569,0.562521,0.738801


## loc- Access a group of rows and columns by label(s) or a boolean array.
## iloc - Purely integer-location based indexing for selection by position.

In [61]:
newdf.head(2)

Unnamed: 0,a,b,c,d,e
0,64.0,0.656687,0.610947,0.933088,0.321774
1,0.967138,0.381282,0.094856,0.223117,0.050185


In [62]:
newdf.iloc[0,4] # 0th row 5th column(0,1,2,3,4)

0.3217744863549894

In [63]:
newdf.iloc[[0,1],[1,2]]

Unnamed: 0,b,c
0,0.656687,0.610947
1,0.381282,0.094856


In [64]:
newdf.drop([0]) # Drop specified labels from rows or columns.

Unnamed: 0,a,b,c,d,e
1,0.967138,0.381282,0.094856,0.223117,0.050185
2,0.491422,0.118451,0.160718,0.938268,0.552133
3,0.611986,0.082126,0.432831,0.104175,0.522716
4,0.095982,0.643074,0.158596,0.072365,0.380501
5,0.606373,0.724032,0.151007,0.081908,0.099598
...,...,...,...,...,...
329,0.739568,0.214652,0.164648,0.298359,0.000325
330,0.937659,0.173832,0.734524,0.859751,0.576515
331,0.551165,0.803094,0.117047,0.986444,0.583409
332,0.797107,0.249999,0.854375,0.513266,0.767918


In [65]:
newdf.drop(['a', 'b'], axis=1)

Unnamed: 0,c,d,e
0,0.610947,0.933088,0.321774
1,0.094856,0.223117,0.050185
2,0.160718,0.938268,0.552133
3,0.432831,0.104175,0.522716
4,0.158596,0.072365,0.380501
...,...,...,...
329,0.164648,0.298359,0.000325
330,0.734524,0.859751,0.576515
331,0.117047,0.986444,0.583409
332,0.854375,0.513266,0.767918


In [66]:
newdf.drop(['a', 'b'], axis=1, inplace=True) # do operation on original

In [67]:
newdf

Unnamed: 0,c,d,e
0,0.610947,0.933088,0.321774
1,0.094856,0.223117,0.050185
2,0.160718,0.938268,0.552133
3,0.432831,0.104175,0.522716
4,0.158596,0.072365,0.380501
...,...,...,...
329,0.164648,0.298359,0.000325
330,0.734524,0.859751,0.576515
331,0.117047,0.986444,0.583409
332,0.854375,0.513266,0.767918


In [68]:
newdf.drop([1,5], axis=0,inplace=True)

In [69]:
newdf

Unnamed: 0,c,d,e
0,0.610947,0.933088,0.321774
2,0.160718,0.938268,0.552133
3,0.432831,0.104175,0.522716
4,0.158596,0.072365,0.380501
6,0.892546,0.232492,0.118668
...,...,...,...
329,0.164648,0.298359,0.000325
330,0.734524,0.859751,0.576515
331,0.117047,0.986444,0.583409
332,0.854375,0.513266,0.767918


In [70]:
newdf.reset_index() # Reset the index, or a level of it.

Unnamed: 0,index,c,d,e
0,0,0.610947,0.933088,0.321774
1,2,0.160718,0.938268,0.552133
2,3,0.432831,0.104175,0.522716
3,4,0.158596,0.072365,0.380501
4,6,0.892546,0.232492,0.118668
...,...,...,...,...
327,329,0.164648,0.298359,0.000325
328,330,0.734524,0.859751,0.576515
329,331,0.117047,0.986444,0.583409
330,332,0.854375,0.513266,0.767918


In [71]:
newdf.reset_index(drop=True, inplace=True) # Removes index column

In [72]:
newdf

Unnamed: 0,c,d,e
0,0.610947,0.933088,0.321774
1,0.160718,0.938268,0.552133
2,0.432831,0.104175,0.522716
3,0.158596,0.072365,0.380501
4,0.892546,0.232492,0.118668
...,...,...,...
327,0.164648,0.298359,0.000325
328,0.734524,0.859751,0.576515
329,0.117047,0.986444,0.583409
330,0.854375,0.513266,0.767918


In [73]:
newdf['c'].isnull() # Detect missing values.

0      False
1      False
2      False
3      False
4      False
       ...  
327    False
328    False
329    False
330    False
331    False
Name: c, Length: 332, dtype: bool

In [74]:
newdf['c'] = None # or newdf.loc[:, ['c']] = None # This is best

In [75]:
newdf['c'].isnull()

0      True
1      True
2      True
3      True
4      True
       ... 
327    True
328    True
329    True
330    True
331    True
Name: c, Length: 332, dtype: bool

In [76]:
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                            pd.NaT]})

In [77]:
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [78]:
df.dropna() # Remove missing values.

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [79]:
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})

In [80]:
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [81]:
df.drop_duplicates() # Return DataFrame with duplicate rows removed.

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [82]:
df.info() # Print a concise summary of a DataFrame.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   brand   5 non-null      object 
 1   style   5 non-null      object 
 2   rating  5 non-null      float64
dtypes: float64(1), object(2)
memory usage: 248.0+ bytes


In [83]:
df.shape # Return a tuple representing the dimensionality of the DataFrame.

(5, 3)

In [84]:
data = pd.read_excel('data.xlsx') #Read an Excel file into a pandas DataFrame.

In [85]:
data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,train no2,name2,marks2,city2
0,0,34,34,34,34,a,12,W
1,1,1,1,1,1,b,21,X
2,2,2,2,2,2,c,13,Y
3,3,3,3,3,3,d,31,Z


In [86]:
data = pd.read_excel('data.xlsx', sheet_name='Sheet2')

In [87]:
data.iloc[0,0] = 34

In [88]:
data.to_excel('data.xlsx',sheet_name='Sheet2') # Write object to an Excel sheet. Sheet 1 is lost.

In [89]:
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [1, 2, 3, 5]})

In [90]:
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [5, 6, 7, 8]})

In [91]:
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5


In [92]:
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [93]:
df1.merge(df2, left_on='lkey', right_on='rkey') 
# Merge DataFrame or named Series objects with a database-style join.
# Merge df1 and df2 on the lkey and rkey columns.
# The value columns have the default suffixes, _x and _y, appended.


Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,5,foo,5
3,foo,5,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


In [94]:
df1.merge(df2, left_on='lkey', right_on='rkey',
          suffixes=('_left', '_right'))
# Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns.

Unnamed: 0,lkey,value_left,rkey,value_right
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,5,foo,5
3,foo,5,foo,8
4,bar,2,bar,6
5,baz,3,baz,7
