# Pandas DataFrames Snippets for Data Analysis

## 1. Create DataFrame

pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)

### 1. From Dictionary of Keys,Values

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one entry per row.
data = {
    'Name': ['Vishwas', 'Saahil', 'Ashish', 'Yadneet'],
    'Age': [25, 25, 25, 24],
    'Salary': [20000, 30000, 40000, 50000],
}

In [4]:
# Build the dataframe from the column dictionary; the keys become column labels.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


### 2. From list of Values as each row

In [5]:
# Row-oriented input: every inner list is one record, and the column labels
# are supplied separately in the same order as the values in each record.

columns = ['Name', 'Age', 'Salary']
data = [
    ['Vishwas', 25, 20000],
    ['Saahil', 25, 30000],
    ['Ashish', 25, 40000],
    ['Yadneet', 24, 50000],
]

In [6]:
# Each inner list of `data` becomes one row; `columns` names the fields.
df1 = pd.DataFrame(data=data, columns=columns)
df1

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


### 3. From list of Values as separate lists of data

In [38]:
# Column-wise input: one list per column. The lists are zipped into rows
# when the dataframe is built.

columns = ['Name', 'Age', 'Salary']
names = ['Vishwas', 'Saahil', 'Ashish', 'Yadneet']
age = [25, 25, 25, 24]
salary = [20000, 30000, 40000, 50000]

In [39]:
# zip() pairs the i-th entry of every list into one row tuple.
rows = list(zip(names, age, salary))
df2 = pd.DataFrame(rows, columns=columns)
df2

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


### 4. With column and index specified in dict

In [9]:
# Nested dictionary: each outer key becomes a column label and each inner key
# becomes a row (index) label.
# The mapping is named `departments` rather than `dict` so that the built-in
# `dict` type is not shadowed for the rest of the session.

departments = {"IT": {'Name': 'Vishwas', 'Age': 25, 'Salary': 25000},
               "Mechanical": {'Name': 'Saahil', 'Age': 25, 'Salary': 30000}}

df3 = pd.DataFrame(departments)
df3

Unnamed: 0,IT,Mechanical
Name,Vishwas,Saahil
Age,25,25
Salary,25000,30000


## 2. DataFrame Attributes and Methods

### 1. Transpose

In [11]:
# Transpose: swap the row labels (index) and the column labels of the dataframe

df3.T

Unnamed: 0,Name,Age,Salary
IT,Vishwas,25,25000
Mechanical,Saahil,25,30000


### 2. Shape

In [12]:
# Tuple of (number of rows, number of columns) of the dataframe

df3.shape

(3, 2)

### 3. Size

In [13]:
# Total number of elements in the dataframe (rows * columns)

df3.size

6

### 4. nlargest

In [14]:
# Return the 2 rows with the largest 'Salary' values, ordered from largest down

df2.nlargest(2,'Salary')

Unnamed: 0,Name,Age,Salary
3,Yadneet,24,50000
2,Ashish,25,40000


### 5. Info

In [15]:
# Print a summary of the dataframe: index range, columns, non-null counts, dtypes and memory usage
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Salary  4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


### 6. Loc

In [16]:
# Show df3 for reference before selecting from it with .loc
df3

Unnamed: 0,IT,Mechanical
Name,Vishwas,Saahil
Age,25,25
Salary,25000,30000


In [17]:
# Select the row labelled 'Name'; passing a list of labels returns a DataFrame
df3.loc[['Name']]

Unnamed: 0,IT,Mechanical
Name,Vishwas,Saahil


In [18]:
# Select all rows (:) of the column labelled 'Mechanical', as a DataFrame
df3.loc[:,['Mechanical']]

Unnamed: 0,Mechanical
Name,Saahil
Age,25
Salary,30000


In [19]:
# Row labels of df3
df3.index

Index(['Name', 'Age', 'Salary'], dtype='object')

### 7. Insert

In [20]:
# Insert a new column 'Media' at position 2 (the 3rd column) of the dataframe.
# insert() modifies df3 in place and raises ValueError when the label already
# exists, so the guard keeps this cell safe to run more than once.

if 'Media' not in df3.columns:
    df3.insert(2, 'Media', ['Yadneet', 24, 40000])
df3

Unnamed: 0,IT,Mechanical,Media
Name,Vishwas,Saahil,Yadneet
Age,25,25,24
Salary,25000,30000,40000


### 8. Sort Values

DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')


by: Single/List of column names to sort Data Frame by.

axis: 0 or ‘index’ for rows and 1 or ‘columns’ for Column.

ascending: Boolean value which sorts Data frame in ascending order if True.

inplace: Boolean value. Makes the changes in passed data frame itself if True.

kind: String naming the sorting algorithm to use: 'quicksort', 'mergesort', 'heapsort' or 'stable'. Default is 'quicksort'.

na_position: Takes two string input ‘last’ or ‘first’ to set position of Null values. Default is ‘last’.

In [21]:
# Show df2 in its original order for comparison with the sorted result
df2

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


In [22]:
# Return a copy of df2 with rows ordered alphabetically by 'Name' (ascending by default)
df2.sort_values(by='Name')

Unnamed: 0,Name,Age,Salary
2,Ashish,25,40000
1,Saahil,25,30000
0,Vishwas,25,20000
3,Yadneet,24,50000


### 9. Where

In [23]:
# Keep rows where the condition holds; in rows where it does not, every value
# is replaced with 0 (instead of the default NaN).

salary_above_20k = df2['Salary'] > 20000
df2.where(salary_above_20k, other=0)

Unnamed: 0,Name,Age,Salary
0,0,0,0
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


### 10. Filter

In [24]:
# Return a dataframe containing only the listed column labels

df2.filter(items=['Salary'])

Unnamed: 0,Salary
0,20000
1,30000
2,40000
3,50000


### 11. astype()

In [26]:
# Inspect the current dtype of the Salary column before casting it
df2['Salary'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 4 entries, 0 to 3
Series name: Salary
Non-Null Count  Dtype
--------------  -----
4 non-null      int64
dtypes: int64(1)
memory usage: 160.0 bytes


In [27]:
# Return the Salary column cast to the platform's default integer type
# (df2 itself is not modified).
df2['Salary'].astype('int')

0    20000
1    30000
2    40000
3    50000
Name: Salary, dtype: int32

### 12. to_datetime()

In [None]:
# Example only (data_1 is not defined in this notebook): parse a 'DOB' column
# into datetime values and store it back in the same column.
# data_1['DOB'] = pd.to_datetime(data_1['DOB'])

### 13. value_counts()

In [28]:
# Count how many rows have each distinct Age, most frequent first
df2['Age'].value_counts()

25    3
24    1
Name: Age, dtype: int64

### 14. drop_duplicates()

In [37]:
# Returns a copy without rows that repeat an earlier row in every column;
# the first occurrence of each row is kept and df2 itself is not modified.

df2.drop_duplicates()

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000
5,Vishwas,25,60000


### 15. append a new row in dataframe with list of values

In [55]:
# Values for the new row, in column order: Name, Age, Salary
new = ['Vishwas',25,60000]

In [56]:
# Assign to the label equal to the current row count; with the default
# 0..n-1 index that label does not exist yet, so a new row is added.
# This changes df2 in place, so running the cell again adds another row.
df2.loc[len(df2)] = new

In [57]:
# Show df2 with the newly added row
df2

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000
4,Vishwas,25,60000


### 16. Drop

In [60]:
# Return a copy without the row labelled 4; df2 itself is left unchanged
df2.drop(4)

Unnamed: 0,Name,Age,Salary
0,Vishwas,25,20000
1,Saahil,25,30000
2,Ashish,25,40000
3,Yadneet,24,50000


### 17. groupby()

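df2.groupby('Salary').mean(numeric_only=True)

numeric_only=True restricts the mean to the numeric columns; without it, the text column Name makes mean() raise a TypeError in pandas 2.x.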

### 18. merge()

In [62]:
# Example only (data_1 and data_2 are not defined in this notebook): left join
# of data_2 onto data_1 using the shared 'Name' column as the key.
# data_1.merge(data_2, on='Name', how='left')

### 19. isnull()

In [63]:
# Boolean dataframe of the same shape: True where a value is missing, False otherwise
df2.isnull()

Unnamed: 0,Name,Age,Salary
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


### 20. fillna()

In [65]:
# Returns the Salary column with null values replaced by 15000;
# df2 is not modified (assign the result back to keep the change).

df2['Salary'].fillna(15000)

0    20000
1    30000
2    40000
3    50000
4    60000
Name: Salary, dtype: int64

### 21. Reset Index to start from 0

In [None]:
# Returns a new dataframe with a fresh 0..n-1 index; the old index is kept as a column
df2.reset_index()

### 22. Reset Index to start from 0 and drop new Index column generated

In [None]:
# drop=True - Discards the old index instead of keeping it as a new column after the reset
# inplace=True - Makes the changes in the dataframe itself and does not return a new dataframe

df2.reset_index(drop=True,inplace=True)

### 23. Join Columns of both Dataframes

In [None]:
# df1 and df2 share the column names Name, Age and Salary, so join() needs
# suffixes to tell them apart; without them it raises ValueError.
# Rows are matched on the index (left join by default).
joindf = df1.join(df2, lsuffix='_df1', rsuffix='_df2')

## 3. DataFrame transformations

### 1. Read CSV files to dataframe

In [67]:
# Load the FIFA player data; the path is relative to the current working directory
fifa = pd.read_csv('Notebooks/fifa_data.csv')

In [69]:
# Preview the first 5 rows of the raw data
fifa.head(5)

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


### 2. Get only required columns for processing

In [83]:
# Keep only the columns needed for processing. .loc slices by label and
# includes the end label, so :100 keeps index labels up to and including 100.
wanted_columns = ['Name', 'Age', 'Nationality', 'Club', 'Value', 'Position']
fifa = fifa.loc[:100, wanted_columns]
fifa.head(5)

Unnamed: 0,Name,Age,Nationality,Club,Value,Position
0,L. Messi,31,Argentina,FC Barcelona,€110.5M,RF
1,Cristiano Ronaldo,33,Portugal,Juventus,€77M,ST
2,Neymar Jr,26,Brazil,Paris Saint-Germain,€118.5M,LW
3,De Gea,27,Spain,Manchester United,€72M,GK
4,K. De Bruyne,27,Belgium,Manchester City,€102M,RCM


### 3. Strip the currency symbol and unit suffix from the Value column (values in € millions)

In [94]:
# Strip the leading '€' and the trailing 'M' (millions) and convert the rest
# to float, so Value holds plain numbers in millions of euros.
# astype(str) comes first so the cell still works when it is run again after
# Value has already been converted to numbers.
# NOTE(review): a value with a different unit suffix (e.g. 'K') would fail the
# float conversion — confirm none occur in the selected rows.
fifa['Value'] = (
    fifa['Value']
    .astype(str)
    .str.lstrip('€')
    .str.rstrip('M')
    .astype(float)
)

In [96]:
# Preview the first 5 rows after cleaning the Value column
fifa.head(5)

Unnamed: 0,Name,Age,Nationality,Club,Value,Position
0,L. Messi,31,Argentina,FC Barcelona,110.5,RF
1,Cristiano Ronaldo,33,Portugal,Juventus,77.0,ST
2,Neymar Jr,26,Brazil,Paris Saint-Germain,118.5,LW
3,De Gea,27,Spain,Manchester United,72.0,GK
4,K. De Bruyne,27,Belgium,Manchester City,102.0,RCM


### 4. Save dataframe as CSV file

In [97]:
# index=False leaves the row index out of the file; otherwise it is written as
# an unnamed first column that appears as 'Unnamed: 0' when the CSV is read back.
fifa.to_csv('Fifa_cleaned.csv', index=False)