https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python#gs.B05f3fs

# 1. How To Create a Pandas DataFrame

In [1]:
import numpy as np
import pandas as pd

### Array to DataFrame

In [2]:
data = np.array([['','Col1','Col2'],
                ['Row1',1,2],
                ['Row2',3,4]])
print(data)

[['' 'Col1' 'Col2']
 ['Row1' '1' '2']
 ['Row2' '3' '4']]


In [3]:
print(pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:]))

     Col1 Col2
Row1    1    2
Row2    3    4


In [4]:
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(pd.DataFrame(my_2darray))

   0  1  2
0  1  2  3
1  4  5  6


### Dictionary to DataFrame

In [5]:
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict))

   1  2  3
0  1  1  2
1  3  2  4


### DataFrame to DataFrame

In [6]:
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(pd.DataFrame(my_df))

   A
0  4
1  5
2  6
3  7


### Series to DataFrame

In [7]:
my_series = pd.Series({"United Kingdom":"London", "India":"New Delhi", "United States":"Washington", "Belgium":"Brussels"})
print(pd.DataFrame(my_series))

                         0
Belgium           Brussels
India            New Delhi
United Kingdom      London
United States   Washington


In [8]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
print(df)

   0  1  2
0  1  2  3
1  4  5  6


In [9]:
print(len(df))

2


In [10]:
print(df.index)
print(len(df.index))

RangeIndex(start=0, stop=2, step=1)
2


In [11]:
df.shape

(2, 3)

# 2. How To Select an Index or Column From a Pandas DataFrame

In [12]:
df = pd.DataFrame(data=[[1,2,3],[4,5,6],[7,8,9]], index=[0,1,2], columns=['A','B','C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


### Access element

In [13]:
# Using `iloc[]`
print(df.iloc[0][0])

# Using `loc[]`
print(df.loc[0]['A'])

# Using `at[]`
print(df.at[0,'A'])

# Using `iat[]`
print(df.iat[0,0])

# Using `get_value(index, column)`
print(df.get_value(0, 'A'))

1
1
1
1
1


### Row Selection and Column Selection

In [14]:
# Select Row
print(df.iloc[0])

A    1
B    2
C    3
Name: 0, dtype: int64


In [15]:
# Select Column
print(df.loc[:,'A'])

0    1
1    4
2    7
Name: A, dtype: int64


# 3. How To Add an Index, Row or Column to a Pandas DataFrame

### Adding an Index to a DataFrame

In [16]:
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [17]:
df.set_index('C')

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1,2
6,4,5
9,7,8


### Adding Rows to a DataFrame

In [18]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), index= [2, 'A', 4], columns=[48, 49, 50])
print(df)

   48  49  50
2   1   2   3
A   4   5   6
4   7   8   9


**loc** works on labels of your index.

In [19]:
print(df.loc[2])

48    1
49    2
50    3
Name: 2, dtype: int64


**iloc** works on the positions in your index. 

In [20]:
print(df.iloc[2])

48    7
49    8
50    9
Name: 4, dtype: int64


**ix** is a more complex case:      
when the index is integer-based, works on labels of your index, like **loc**     
when the index is not integer-based, works on positions of your index, like **iloc** 

In [21]:
print(df.ix[2])

48    7
49    8
50    9
Name: 4, dtype: int64


In [22]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), index= [2.5, 12.6, 4.8], columns=[48, 49, 50])
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9


In [23]:
df.ix[2] = [60, 50, 40]
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40


In [24]:
df.loc[2] = [11, 12, 13]
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
2.0   11  12  13


### Adding a Column to Your DataFrame

In [25]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [26]:
df['D'] = df.index
print(df)

   A  B  C  D
0  1  2  3  0
1  4  5  6  1
2  7  8  9  2


In [27]:
df.loc[:, 4] = pd.Series(['5', '6', '7'], index=df.index)
print(df)

   A  B  C  D  4
0  1  2  3  0  5
1  4  5  6  1  6
2  7  8  9  2  7


### Resetting the Index of Your DataFrame

In [28]:
df.reset_index(level=0, drop=True)
print(df)

   A  B  C  D  4
0  1  2  3  0  5
1  4  5  6  1  6
2  7  8  9  2  7


# 4. How to Delete Indices, Rows or Columns From a Pandas Data Frame

### Deleting duplicate from Your DataFrame

In [29]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=[48, 49, 50])
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8   40  50  60
2.5   23  35  37


In [30]:
df.reset_index().drop_duplicates(subset='index', keep='last').set_index('index')

Unnamed: 0_level_0,48,49,50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12.6,4,5,6
4.8,40,50,60
2.5,23,35,37


### Deleting a Column from Your DataFrame

In [31]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [32]:
df.drop(df.columns[[1]], axis=1)
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


### Removing a Row from Your DataFrame

In [33]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [34]:
df.drop(df.index[1])
print(df)
print(    )
df.new=df.drop(df.index[1])
print(df.new)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9

   A  B  C
0  1  2  3
2  7  8  9


# How to Rename the Index or Columns of a Pandas DataFrame

In [35]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [36]:
newcols = {
    'A': 'new_column_1', 
    'B': 'new_column_2', 
    'C': 'new_column_3'
}
df.rename(columns=newcols, inplace=True)
print(df)

   new_column_1  new_column_2  new_column_3
0             1             2             3
1             4             5             6
2             7             8             9


In [37]:
df.rename(index={1: 'a'}, inplace=True)
print(df)

   new_column_1  new_column_2  new_column_3
0             1             2             3
a             4             5             6
2             7             8             9


# 6. How To Format The Data in Your Pandas DataFrame

### Replacing All Occurrences of a String in a DataFrame

In [38]:
df = pd.DataFrame(data=np.array([['OK','Perfect', 'Acceptable'], 
                                 ['Awful','Awful','Perfect'], 
                                 ['Acceptable', 'OK', 'Poor']]), 
                  columns=['Student1', 'Student2', 'Student3'])
print(df)

     Student1 Student2    Student3
0          OK  Perfect  Acceptable
1       Awful    Awful     Perfect
2  Acceptable       OK        Poor


In [39]:
print(df.replace(['Awful', 'Poor', 'OK', 'Acceptable', 'Perfect'], [0, 1, 2, 3, 4]) )

   Student1  Student2  Student3
0         2         4         3
1         0         0         4
2         3         2         1


In [40]:
df = pd.DataFrame(data=np.array([['1\n', 2, '3\n'], [4, 5, '6\n'], [7, '8\n', 9]]))
print(df)

     0    1    2
0  1\n    2  3\n
1    4    5  6\n
2    7  8\n    9


In [41]:
print(df.replace({'\n': '<br>'}, regex=True))

       0      1      2
0  1<br>      2  3<br>
1      4      5  6<br>
2      7  8<br>      9


### Removing Parts From Strings in the Cells of Your DataFrame

In [42]:
df = pd.DataFrame(data=np.array([[1, 2, '+3b'], [4, 5, '-6B'], [7, 8, '+9A']]),columns=['class','test','result'])
print(df)

  class test result
0     1    2    +3b
1     4    5    -6B
2     7    8    +9A


In [43]:
df['result'] = df['result'].map(lambda x: x.lstrip('+-').rstrip('aAbBcC'))
df

Unnamed: 0,class,test,result
0,1,2,3
1,4,5,6
2,7,8,9


### Applying A Function to Your Pandas DataFrame’s Columns or Rows

In [44]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [45]:
doubler = lambda x: x*2
df['A'].apply(doubler)

0     2
1     8
2    14
Name: A, dtype: int64

In [46]:
print(df.applymap(doubler))

    A   B   C
0   2   4   6
1   8  10  12
2  14  16  18


# 7 How To Create an Empty DataFrame

In [47]:
df = pd.DataFrame(np.nan, index=[0,1,2,3], columns=['A'])
print(df)

    A
0 NaN
1 NaN
2 NaN
3 NaN


In [48]:
df = pd.DataFrame(index=range(0,4),columns=['A'], dtype='float')
print(df)

    A
0 NaN
1 NaN
2 NaN
3 NaN


# 8. Reshape Your Pandas DataFrame

### Pivotting Your DataFrame

In [49]:
products = pd.DataFrame({'category': ['Cleaning', 'Cleaning', 'Entertainment', 'Entertainment', 'Tech', 'Tech'],
                        'store': ['Walmart', 'Dia', 'Walmart', 'Fnac', 'Dia','Walmart'],
                        'price':[11.42, 23.50, 19.99, 15.95, 55.75, 111.55],
                        'testscore': [4, 3, 5, 7, 5, 8]})
print(products)

        category   price    store  testscore
0       Cleaning   11.42  Walmart          4
1       Cleaning   23.50      Dia          3
2  Entertainment   19.99  Walmart          5
3  Entertainment   15.95     Fnac          7
4           Tech   55.75      Dia          5
5           Tech  111.55  Walmart          8


In [50]:
pivot_products = products.pivot(index='category', columns='store', values='price')
print(pivot_products)

store            Dia   Fnac  Walmart
category                            
Cleaning       23.50    NaN    11.42
Entertainment    NaN  15.95    19.99
Tech           55.75    NaN   111.55


In [51]:
pivot_products = products.pivot(index='category', columns='store')
print(pivot_products)

               price                testscore             
store            Dia   Fnac Walmart       Dia Fnac Walmart
category                                                  
Cleaning       23.50    NaN   11.42       3.0  NaN     4.0
Entertainment    NaN  15.95   19.99       NaN  7.0     5.0
Tech           55.75    NaN  111.55       5.0  NaN     8.0


### Reshaping Your DataFrame With **Melt()**

In [52]:
people = pd.DataFrame({'FirstName' : ['John', 'Jane'],
                       'LastName' : ['Doe', 'Austen'],
                       'BloodType' : ['A-', 'B+'],
                       'Weight' : [90, 64]})
print(people)

  BloodType FirstName LastName  Weight
0        A-      John      Doe      90
1        B+      Jane   Austen      64


In [53]:
print(pd.melt(people, id_vars=['FirstName', 'LastName'], var_name='measurements'))

  FirstName LastName measurements value
0      John      Doe    BloodType    A-
1      Jane   Austen    BloodType    B+
2      John      Doe       Weight    90
3      Jane   Austen       Weight    64


# 9. How To Iterate Over a Pandas DataFrame

In [54]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [55]:
for index, row in df.iterrows() :
    print(row['A'], row['B'])

1 2
4 5
7 8
