#### Types of Data Analysis
- Descriptive Analysis - What happened?
- Diagnostic Analysis - Why did it happen?
- Predictive Analysis - What is likely to happen in future?
- Prescriptive Analysis - What is the best course of action to take?

In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series is a one-dimensional labelled array; each DataFrame column is one.
# With no index given, the labels default to 0..n-1.
s = pd.Series(data=[1, 34, 53, 12, 3])
s

0     1
1    34
2    53
3    12
4     3
dtype: int64

In [3]:
# A missing value (np.nan) among integers makes the whole Series float64.
s = pd.Series(data=[12, 3, 21, 3, np.nan, 21])
s

0    12.0
1     3.0
2    21.0
3     3.0
4     NaN
5    21.0
dtype: float64

In [4]:
# Build a Series from a NumPy array of strings and give it explicit integer
# labels (100-103) instead of the default 0..n-1 index.
data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data, index=[100, 101, 102, 103])
# End the cell with the value itself so the notebook renders it,
# rather than routing it through print().
s

100    a
101    b
102    c
103    d
dtype: object


In [5]:
# Twenty consecutive calendar days starting on 2023-04-10 (daily frequency).
dates = pd.date_range(start="20230410", periods=20, freq="D")
dates

DatetimeIndex(['2023-04-10', '2023-04-11', '2023-04-12', '2023-04-13',
               '2023-04-14', '2023-04-15', '2023-04-16', '2023-04-17',
               '2023-04-18', '2023-04-19', '2023-04-20', '2023-04-21',
               '2023-04-22', '2023-04-23', '2023-04-24', '2023-04-25',
               '2023-04-26', '2023-04-27', '2023-04-28', '2023-04-29'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Eight consecutive daily timestamps, used as the row index of df.
dates = pd.date_range("20230410", periods=8)

# Seeded generator so that Restart & Run All reproduces the same frame;
# an unseeded np.random.randn call yields different values on every run.
rng = np.random.default_rng(42)

# 8 rows x 5 columns of standard-normal samples, one row per date.
df = pd.DataFrame(rng.standard_normal((8, 5)), index=dates, columns=list('SWXYZ'))
df

Unnamed: 0,S,W,X,Y,Z
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545


In [7]:
# Build a frame whose columns each come from a different kind of value:
# the scalars (A, B, F) are broadcast to every row, while C, D and E each
# supply four values. The row labels 0..3 come from the Series used for C.
# NOTE(review): "Test" and "test" differ only in case, so E ends up with
# three categories - confirm whether that is intended.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20230410"),
        C=pd.Series(1, index=list(range(4)), dtype="float64"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["Test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-04-10,1.0,3,Test,foo
1,1.0,2023-04-10,1.0,3,train,foo
2,1.0,2023-04-10,1.0,3,test,foo
3,1.0,2023-04-10,1.0,3,train,foo


In [8]:
# Per-column dtypes: each column of df2 was built from a different kind of value.
df2.dtypes

A           float64
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

In [9]:
# First two rows of df.
df.head(2)

Unnamed: 0,S,W,X,Y,Z
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392


In [10]:
# Last two rows of df.
df.tail(2)

Unnamed: 0,S,W,X,Y,Z
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545


In [11]:
# Row labels of df: the DatetimeIndex passed as index= when df was built.
df.index

DatetimeIndex(['2023-04-10', '2023-04-11', '2023-04-12', '2023-04-13',
               '2023-04-14', '2023-04-15', '2023-04-16', '2023-04-17'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Row labels of df2: the integers 0..3, matching the index of the Series used for column C.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [13]:
# Convert the DataFrame to a NumPy array. Every column of df is float64, so
# the result is a plain 2-D float array; index and column labels are dropped.
df.to_numpy()

array([[ 1.86532738, -0.37005638, -0.29262898,  0.58317044, -0.74625645],
       [-0.77491485,  1.12301552,  1.06611743,  0.34697948,  0.2213922 ],
       [-0.53189563, -2.10883456,  1.5421412 , -1.2142588 ,  0.59566325],
       [-0.72475932, -0.18705542,  0.35846388, -2.08689932,  0.58735092],
       [ 0.07629449, -0.53142188, -1.39815356,  0.48507016, -0.14974965],
       [-0.10693911,  0.49698286,  1.35688737,  0.62510203, -0.39706043],
       [ 1.72393959,  0.15998559, -0.81945892, -0.1600923 ,  1.89329924],
       [-0.00750066, -0.22314198, -1.45789348,  0.80997934,  1.56454463]])

In [14]:
# df2 mixes column dtypes, so the resulting array falls back to dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2023-04-10 00:00:00'), 1.0, 3, 'Test', 'foo'],
       [1.0, Timestamp('2023-04-10 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-04-10 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-04-10 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [15]:
# Summary statistics per column: count, mean, std, min, the quartiles
# (the 50% row is the median) and max.
df.describe()

Unnamed: 0,S,W,X,Y,Z
count,8.0,8.0,8.0,8.0,8.0
mean,0.189944,-0.205066,0.044434,-0.076369,0.446148
std,1.041207,0.937219,1.213383,1.038621,0.920077
min,-0.774915,-2.108835,-1.457893,-2.086899,-0.746256
25%,-0.580112,-0.410398,-0.964133,-0.423634,-0.211577
50%,-0.05722,-0.205099,0.032917,0.416025,0.404372
75%,0.488206,0.244235,1.13881,0.593653,0.837884
max,1.865327,1.123016,1.542141,0.809979,1.893299


In [16]:
# Transpose: columns become rows and the row labels become the column headers.
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2023-04-10 00:00:00,2023-04-10 00:00:00,2023-04-10 00:00:00,2023-04-10 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,Test,train,test,train
F,foo,foo,foo,foo


In [17]:
# Sort the rows by their index labels (axis=0) in descending order,
# i.e. latest date first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,S,W,X,Y,Z
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256


In [18]:
# Sort the rows by the values in column W (ascending by default).
df.sort_values(by="W")

Unnamed: 0,S,W,X,Y,Z
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392


In [19]:
# Select a single column by label; the result is a Series.
df['Z']

2023-04-10   -0.746256
2023-04-11    0.221392
2023-04-12    0.595663
2023-04-13    0.587351
2023-04-14   -0.149750
2023-04-15   -0.397060
2023-04-16    1.893299
2023-04-17    1.564545
Freq: D, Name: Z, dtype: float64

In [20]:
# Slicing with [] selects rows by position: rows 2 to 5 (the end is exclusive).
df[2:6]

Unnamed: 0,S,W,X,Y,Z
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706


In [21]:
# .loc selects by label. A single row label returns that row as a Series
# whose index is the column names (a cross-section of the frame).
df.loc[dates[0]]

S    1.865327
W   -0.370056
X   -0.292629
Y    0.583170
Z   -0.746256
Name: 2023-04-10 00:00:00, dtype: float64

In [22]:
# Column selection with .loc: all rows (:) and only the columns labelled S and W.
df.loc[:,["S","W"]]

Unnamed: 0,S,W
2023-04-10,1.865327,-0.370056
2023-04-11,-0.774915,1.123016
2023-04-12,-0.531896,-2.108835
2023-04-13,-0.724759,-0.187055
2023-04-14,0.076294,-0.531422
2023-04-15,-0.106939,0.496983
2023-04-16,1.72394,0.159986
2023-04-17,-0.007501,-0.223142


In [23]:
# Label-based row slice by date string. Unlike positional slicing, both
# endpoints are included (2023-04-13 is part of the result).
df.loc['20230410':'20230413', ["S","W"]]

Unnamed: 0,S,W
2023-04-10,1.865327,-0.370056
2023-04-11,-0.774915,1.123016
2023-04-12,-0.531896,-2.108835
2023-04-13,-0.724759,-0.187055


In [24]:
# A single date label plus a list of columns returns a Series with those three values.
df.loc['20230410', ["S","W","X"]]

S    1.865327
W   -0.370056
X   -0.292629
Name: 2023-04-10 00:00:00, dtype: float64

In [25]:
# .at is label-based scalar access: one row label and one column label
# return a single value. (The position-based counterpart is .iat.)
df.at[dates[4],"S"]

0.0762944863539185

In [26]:
# Position-based row slice. The end (10) is past the 8 rows in df; slice
# bounds that run off the end are clipped rather than raising, so this
# returns rows 3 through 7.
df.iloc[3:10]

Unnamed: 0,S,W,X,Y,Z
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545


In [27]:
# .iloc[rows, columns]: the first five rows and the first four columns,
# selected by position (slice ends are exclusive).
df.iloc[0:5, 0:4]

Unnamed: 0,S,W,X,Y
2023-04-10,1.865327,-0.370056,-0.292629,0.58317
2023-04-11,-0.774915,1.123016,1.066117,0.346979
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899
2023-04-14,0.076294,-0.531422,-1.398154,0.48507


In [28]:
# All rows, first two columns by position.
df.iloc[:,0:2]

Unnamed: 0,S,W
2023-04-10,1.865327,-0.370056
2023-04-11,-0.774915,1.123016
2023-04-12,-0.531896,-2.108835
2023-04-13,-0.724759,-0.187055
2023-04-14,0.076294,-0.531422
2023-04-15,-0.106939,0.496983
2023-04-16,1.72394,0.159986
2023-04-17,-0.007501,-0.223142


In [29]:
# Boolean mask on one column: keep only the rows where S is greater than 0.5.
df[df["S"] > 0.5]

Unnamed: 0,S,W,X,Y,Z
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299


In [30]:
# Element-wise mask over the whole frame: the shape is kept and every value
# that fails the condition is replaced by NaN.
df[df > 0.5]

Unnamed: 0,S,W,X,Y,Z
2023-04-10,1.865327,,,0.58317,
2023-04-11,,1.123016,1.066117,,
2023-04-12,,,1.542141,,0.595663
2023-04-13,,,,,0.587351
2023-04-14,,,,,
2023-04-15,,,1.356887,0.625102,
2023-04-16,1.72394,,,,1.893299
2023-04-17,,,,0.809979,1.564545


In [31]:
# assign() returns a new frame, so df itself is left untouched; the new
# text column E carries one label per row.
df3 = df.assign(E=["one","two","one","three","four","one","two","four"])
df3

Unnamed: 0,S,W,X,Y,Z,E
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256,one
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392,two
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663,one
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351,three
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975,four
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706,one
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299,two
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545,four


In [32]:
# Derived column: A is column S plus 1, computed element-wise.
# The column is added to df3 in place.
df3["A"] = df3["S"] + 1
df3

Unnamed: 0,S,W,X,Y,Z,E,A
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256,one,2.865327
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392,two,0.225085
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663,one,0.468104
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351,three,0.275241
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975,four,1.076294
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706,one,0.893061
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299,two,2.72394
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545,four,0.992499


In [35]:
# Row-wise mean (axis=1) over the five original numeric columns only;
# the text column E and the derived column A are left out of the average.
# The result is stored in a new column named 'mean'.
df3['mean'] = df3[['S', 'W', 'X', 'Y', 'Z']].mean(axis=1)
df3

Unnamed: 0,S,W,X,Y,Z,E,A,mean
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256,one,2.865327,0.207911
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392,two,0.225085,0.396518
2023-04-12,-0.531896,-2.108835,1.542141,-1.214259,0.595663,one,0.468104,-0.343437
2023-04-13,-0.724759,-0.187055,0.358464,-2.086899,0.587351,three,0.275241,-0.41058
2023-04-14,0.076294,-0.531422,-1.398154,0.48507,-0.14975,four,1.076294,-0.303592
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706,one,0.893061,0.394995
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299,two,2.72394,0.559535
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545,four,0.992499,0.137198


In [39]:
# Keep only the rows whose row-wise mean is greater than 0.
df3[df3["mean"] > 0]

Unnamed: 0,S,W,X,Y,Z,E,A,mean
2023-04-10,1.865327,-0.370056,-0.292629,0.58317,-0.746256,one,2.865327,0.207911
2023-04-11,-0.774915,1.123016,1.066117,0.346979,0.221392,two,0.225085,0.396518
2023-04-15,-0.106939,0.496983,1.356887,0.625102,-0.39706,one,0.893061,0.394995
2023-04-16,1.72394,0.159986,-0.819459,-0.160092,1.893299,two,2.72394,0.559535
2023-04-17,-0.007501,-0.223142,-1.457893,0.809979,1.564545,four,0.992499,0.137198


In [1]:
import pandas as pd

# Scores of eight players (A-H) across four games, one row per player.
# Note: this rebinds the name df to a new, unrelated frame.
df = pd.DataFrame(
    dict(
        player=list("ABCDEFGH"),
        game1=[18, 22, 19, 14, 14, 11, 20, 28],
        game2=[5, 7, 7, 9, 12, 9, 9, 4],
        game3=[11, 8, 10, 6, 6, 5, 9, 12],
        game4=[9, 8, 10, 9, 14, 15, 10, 11],
    )
)

df

Unnamed: 0,player,game1,game2,game3,game4
0,A,18,5,11,9
1,B,22,7,8,8
2,C,19,7,10,10
3,D,14,9,6,9
4,E,14,12,6,14
5,F,11,9,5,15
6,G,20,9,9,10
7,H,28,4,12,11


In [3]:
# Column-wise mean; numeric_only=True leaves out the non-numeric 'player' column.
df.mean(numeric_only=True)

game1    18.250
game2     7.750
game3     8.375
game4    10.750
dtype: float64

In [4]:
# Column-wise median; numeric_only=True leaves out the non-numeric 'player' column.
df.median(numeric_only=True)

game1    18.5
game2     8.0
game3     8.5
game4    10.0
dtype: float64

In [5]:
# Column-wise mode. The result is a DataFrame because a column can have more
# than one mode (game4 has two: 9 and 10); columns with fewer modes are
# padded with NaN.
df.mode(numeric_only=True)

Unnamed: 0,game1,game2,game3,game4
0,14.0,9.0,6.0,9
1,,,,10
