In [1]:
import numpy as np
import pandas as pd
# Pandas is an open source data analysis library written in python
# It leverages the power and speed of numpy to make data analysis and preprocessing easy for data scientists
# It provides rich and highly robust data operations

In [2]:
# Pandas has two types of data structures:
# a) Series — It's a one dimensional array with indexes, it stores a single column or row of data in a Dataframe
# b) Dataframe — It's a tabular spreadsheet like structure representing rows each of which contains one or multiple columns
# A one-dimensional array(labeled) capable of holding any type of data— Series
# A two-dimensional data (labeled) structure with columns of potentially different types of data - DataFrame

In [3]:
dict1 = {
    "name": ['harry', 'rohan', 'skillf', 'shubh'],
    "marks": [92, 34, 24, 17],
    "city": ['rampur', 'kolkata', 'bareilly', 'antartica']
}

In [4]:
df = pd.DataFrame(dict1) 
# we will store data frame in a variable
# data frame is like a excel sheet. with data frame help we can do fast indexing, we can also able to use numpy function

In [5]:
df

Unnamed: 0,name,marks,city
0,harry,92,rampur
1,rohan,34,kolkata
2,skillf,24,bareilly
3,shubh,17,antartica


In [6]:
df.to_csv('friends.csv')   
# convert data frame into csv file
# syntax : df.to_csv ('filename.csv')

In [7]:
# if we don't want index in our csv file then
df.to_csv('friends_index_false.csv' , index = False)
# false first word should be in capital letter
# NOTE: index means row, aur column ko column hi bolte hain

In [8]:
df.head (2)
# ye suru ka 2 index dikhayega

Unnamed: 0,name,marks,city
0,harry,92,rampur
1,rohan,34,kolkata


In [9]:
df.head (3)
# ye suru ka 3 index dikhayega

Unnamed: 0,name,marks,city
0,harry,92,rampur
1,rohan,34,kolkata
2,skillf,24,bareilly


In [10]:
df.tail (2)
# ye last ka 2 index dikhayega

Unnamed: 0,name,marks,city
2,skillf,24,bareilly
3,shubh,17,antartica


In [11]:
df.tail (3)
# ye last ka 3 index dikhayega

Unnamed: 0,name,marks,city
1,rohan,34,kolkata
2,skillf,24,bareilly
3,shubh,17,antartica


In [12]:
df.describe ()
# jo bhi numerical column data hain uska count, mean.std (standard deviation),min,25%,50%,75% and max value btayga

Unnamed: 0,marks
count,4.0
mean,41.75
std,34.21866
min,17.0
25%,22.25
50%,29.0
75%,48.5
max,92.0


In [13]:
# Read a csv file and store it in a variable
harry = pd.read_csv('vikram.csv')

In [14]:
harry

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train no,Speed,City
0,0,0,0,12233,100,Delhi
1,1,1,1,12351,57,Kolkata
2,2,2,2,12561,87,mumbai
3,3,3,3,24376,45,patna
4,4,4,4,23425,90,asansol


In [15]:
harry['City']

0      Delhi
1    Kolkata
2     mumbai
3      patna
4    asansol
Name: City, dtype: object

In [16]:
harry['Train no']

0    12233
1    12351
2    12561
3    24376
4    23425
Name: Train no, dtype: int64

In [17]:
harry['Speed ']
# there is a space gap after Speed

0    100
1     57
2     87
3     45
4     90
Name: Speed , dtype: int64

In [18]:
harry [['Train no', 'Speed ']]
# selecting more than one column at once

Unnamed: 0,Train no,Speed
0,12233,100
1,12351,57
2,12561,87
3,24376,45
4,23425,90


In [19]:
harry['Speed '].head(2)

0    100
1     57
Name: Speed , dtype: int64

In [20]:
harry['Speed '][0]
# shows the value of speed is 67

np.int64(100)

In [21]:
harry['Speed '][0] = 100
# we update the value of speed to 100
# this method is not advisable

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  harry['Speed '][0] = 100


In [22]:
harry['Speed ']

0    100
1     57
2     87
3     45
4     90
Name: Speed , dtype: int64

In [23]:
harry.to_csv ('vikram.csv')
# we are updating changes to csv file

In [24]:
# how to change index of csv file
harry.index = ['first','second','third','fourth','fifth']

In [25]:
harry

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train no,Speed,City
first,0,0,0,12233,100,Delhi
second,1,1,1,12351,57,Kolkata
third,2,2,2,12561,87,mumbai
fourth,3,3,3,24376,45,patna
fifth,4,4,4,23425,90,asansol


In [26]:
# ----------------------------------------------------------------------------------------------------------------------------

In [27]:
# Pandas has two types of data structures:
# a) Series — It's a one dimensional array with indexes, it stores a single column or row of data in a Dataframe
# b) Dataframe — It's a tabular spreadsheet like structure representing rows each of which contains one or multiple columns
# A one-dimensional array(labeled) capable of holding any type of data— Series
# A two-dimensional data (labeled) structure with columns of potentially different types of data - DataFrame

In [28]:
ser = pd.Series (np.random.rand())

In [29]:
type(ser)

pandas.core.series.Series

In [30]:
ser = pd.Series (np.random.rand(34))

In [31]:
ser

0     0.626756
1     0.315319
2     0.435170
3     0.397518
4     0.875150
5     0.107050
6     0.168068
7     0.944370
8     0.590435
9     0.272549
10    0.807792
11    0.776393
12    0.749477
13    0.053197
14    0.840750
15    0.285828
16    0.511469
17    0.691760
18    0.556907
19    0.568163
20    0.377624
21    0.683902
22    0.265268
23    0.342310
24    0.711535
25    0.076990
26    0.495826
27    0.741961
28    0.355626
29    0.962132
30    0.218763
31    0.063665
32    0.515830
33    0.363938
dtype: float64

In [32]:
# Now, we are going to make new DataFrame

In [33]:
newdf = pd.DataFrame(np.random.rand(334,5))

In [34]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.396913,0.818824,0.548025,0.022180,0.140742
1,0.728445,0.114544,0.725716,0.390201,0.520081
2,0.374800,0.846142,0.021044,0.312407,0.348585
3,0.059859,0.110872,0.069597,0.083635,0.202105
4,0.765796,0.516199,0.289079,0.867226,0.284108
...,...,...,...,...,...
329,0.879090,0.233078,0.170570,0.196284,0.938480
330,0.420148,0.470158,0.460541,0.211254,0.847370
331,0.276385,0.873813,0.479600,0.067746,0.684815
332,0.235353,0.907165,0.590595,0.208192,0.136079


In [35]:
newdf = pd.DataFrame(np.random.rand(334,5), index = np.arange(334))

In [36]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.949929,0.963387,0.415592,0.366470,0.009209
1,0.513208,0.526904,0.655313,0.967006,0.870069
2,0.714780,0.399358,0.118568,0.217121,0.201434
3,0.082782,0.774036,0.525955,0.424344,0.672570
4,0.436701,0.449087,0.223396,0.451097,0.233582
...,...,...,...,...,...
329,0.363302,0.354862,0.295935,0.927258,0.178281
330,0.278179,0.629779,0.070092,0.747344,0.037489
331,0.406437,0.515095,0.250246,0.310815,0.567333
332,0.652980,0.314201,0.129195,0.588565,0.296024


In [37]:
type(newdf)

pandas.core.frame.DataFrame

In [38]:
newdf.describe()

Unnamed: 0,0,1,2,3,4
count,334.0,334.0,334.0,334.0,334.0
mean,0.501823,0.495386,0.52679,0.494512,0.512515
std,0.28007,0.283264,0.291658,0.287202,0.2951
min,0.004693,0.003449,0.00165,0.005005,0.00312
25%,0.269934,0.256146,0.255207,0.235579,0.254846
50%,0.492902,0.504074,0.530948,0.503073,0.527954
75%,0.749072,0.723562,0.783408,0.732544,0.773469
max,0.995034,0.989782,0.99978,0.998757,0.991942


In [39]:
newdf.dtypes
# ye har ek column ka data type bta raha hain

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [40]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.949929,0.963387,0.415592,0.36647,0.009209
1,0.513208,0.526904,0.655313,0.967006,0.870069
2,0.71478,0.399358,0.118568,0.217121,0.201434
3,0.082782,0.774036,0.525955,0.424344,0.67257
4,0.436701,0.449087,0.223396,0.451097,0.233582


In [41]:
newdf[0][0] = "Vikram"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  newdf[0][0] = "Vikram"
  newdf[0][0] = "Vikram"


In [42]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,Vikram,0.963387,0.415592,0.36647,0.009209
1,0.513208,0.526904,0.655313,0.967006,0.870069
2,0.71478,0.399358,0.118568,0.217121,0.201434
3,0.082782,0.774036,0.525955,0.424344,0.67257
4,0.436701,0.449087,0.223396,0.451097,0.233582


In [43]:
newdf.dtypes
# now column 0 has become object because we have put string in column 0

0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [44]:
# now, we will print the index
newdf.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       324, 325, 326, 327, 328, 329, 330, 331, 332, 333],
      dtype='int64', length=334)

In [46]:
# now, we will print the column of dataframe
newdf.columns

RangeIndex(start=0, stop=5, step=1)

In [47]:
newdf[0][0] = 0.34

In [48]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.34,0.963387,0.415592,0.36647,0.009209
1,0.513208,0.526904,0.655313,0.967006,0.870069
2,0.71478,0.399358,0.118568,0.217121,0.201434
3,0.082782,0.774036,0.525955,0.424344,0.67257
4,0.436701,0.449087,0.223396,0.451097,0.233582


In [49]:
# now, we will try to convert into numpy array
newdf.to_numpy()

array([[0.34, 0.9633872358812178, 0.4155924329704418, 0.3664699745247968,
        0.009208565870978402],
       [0.5132082373219299, 0.5269039179911068, 0.6553127847270387,
        0.9670061022271741, 0.8700688508473257],
       [0.7147798233445254, 0.3993577572554049, 0.11856799201330948,
        0.2171205642636037, 0.20143379547184692],
       ...,
       [0.40643748633288723, 0.5150952107737653, 0.25024603854009897,
        0.3108150574081959, 0.5673329271049887],
       [0.6529804616738357, 0.3142014953007207, 0.12919462571284035,
        0.5885648573262635, 0.29602419281905423],
       [0.7262693634370073, 0.2847661184170034, 0.33080628399125944,
        0.42976121544769463, 0.8819324902464585]], dtype=object)

In [51]:
newdf.T
# isse is data ka transpose ho jayga
# same as matrix transpose

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
0,0.34,0.513208,0.71478,0.082782,0.436701,0.464861,0.8261,0.173713,0.388937,0.752856,...,0.256198,0.006674,0.521671,0.204015,0.965527,0.363302,0.278179,0.406437,0.65298,0.726269
1,0.963387,0.526904,0.399358,0.774036,0.449087,0.895027,0.237815,0.87701,0.894457,0.584155,...,0.613426,0.685711,0.664704,0.390416,0.034251,0.354862,0.629779,0.515095,0.314201,0.284766
2,0.415592,0.655313,0.118568,0.525955,0.223396,0.238711,0.004191,0.807477,0.726462,0.460621,...,0.441532,0.339204,0.068066,0.309844,0.898121,0.295935,0.070092,0.250246,0.129195,0.330806
3,0.36647,0.967006,0.217121,0.424344,0.451097,0.445944,0.851193,0.610304,0.104961,0.732905,...,0.046955,0.208135,0.232627,0.33086,0.188926,0.927258,0.747344,0.310815,0.588565,0.429761
4,0.009209,0.870069,0.201434,0.67257,0.233582,0.701051,0.77423,0.453473,0.948999,0.088492,...,0.583456,0.620995,0.489232,0.529609,0.572789,0.178281,0.037489,0.567333,0.296024,0.881932


In [54]:
# we will sort the index of data in descending order
newdf.sort_index (axis=0, ascending = False)
# by default ascending humesha True rehta hain
# axis yahan pe row ko represent karta hain

Unnamed: 0,0,1,2,3,4
333,0.726269,0.284766,0.330806,0.429761,0.881932
332,0.65298,0.314201,0.129195,0.588565,0.296024
331,0.406437,0.515095,0.250246,0.310815,0.567333
330,0.278179,0.629779,0.070092,0.747344,0.037489
329,0.363302,0.354862,0.295935,0.927258,0.178281
...,...,...,...,...,...
4,0.436701,0.449087,0.223396,0.451097,0.233582
3,0.082782,0.774036,0.525955,0.424344,0.672570
2,0.71478,0.399358,0.118568,0.217121,0.201434
1,0.513208,0.526904,0.655313,0.967006,0.870069


In [55]:
# NOTE: in 2-D numpy array
# axis = 0  [this represents row]
# axis = 1 [this represents column]

In [56]:
newdf.sort_index (axis=1, ascending = False)
# we have sorted the column in descending order

Unnamed: 0,4,3,2,1,0
0,0.009209,0.366470,0.415592,0.963387,0.34
1,0.870069,0.967006,0.655313,0.526904,0.513208
2,0.201434,0.217121,0.118568,0.399358,0.71478
3,0.672570,0.424344,0.525955,0.774036,0.082782
4,0.233582,0.451097,0.223396,0.449087,0.436701
...,...,...,...,...,...
329,0.178281,0.927258,0.295935,0.354862,0.363302
330,0.037489,0.747344,0.070092,0.629779,0.278179
331,0.567333,0.310815,0.250246,0.515095,0.406437
332,0.296024,0.588565,0.129195,0.314201,0.65298


In [57]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.34,0.963387,0.415592,0.36647,0.009209
1,0.513208,0.526904,0.655313,0.967006,0.870069
2,0.71478,0.399358,0.118568,0.217121,0.201434
3,0.082782,0.774036,0.525955,0.424344,0.67257
4,0.436701,0.449087,0.223396,0.451097,0.233582


In [58]:
newdf[0]

0          0.34
1      0.513208
2       0.71478
3      0.082782
4      0.436701
         ...   
329    0.363302
330    0.278179
331    0.406437
332     0.65298
333    0.726269
Name: 0, Length: 334, dtype: object

In [59]:
type (newdf[0])
# from here we can conclude that series ke combination se data frame banta hain

pandas.core.series.Series