### Taking data from website (in string format)

In [6]:
# Raw employee records as comma-separated text, one record per line.
# Text fields are wrapped in single quotes, missing values are the bare
# token NULL, and several records appear more than once.
string = """7698,'BLAKE','MANAGER',7839,'1-MAY-81',2850,NULL,30
7369,'SMITH','CLERK',7902,'17-DEC-80',800,NULL,20
7788,'SCOTT','ANALYST',7566,'09-DEC-82',3000,NULL,20
7934,'MILLER','CLERK',7782,'23-JAN-82',1300,NULL,10
7654,'MARTIN','SALESMAN',7698,'28-SEP-81',1250,1400,30
7499,'ALLEN','SALESMAN',7698,'20-FEB-81',1600,300,30
7782,'CLARK','MANAGER',7839,'9-JUN-81',2450,NULL,10
7844,'TURNER','SALESMAN',7698,'8-SEP-81',1500,0,30
7900,'JAMES','CLERK',7698,'3-DEC-81',950,NULL,30
7521,'WARD','SALESMAN',7698,'22-FEB-81',1250,500,30
7902,'FORD','ANALYST',7566,'3-DEC-81',3000,NULL,20
7876,'ADAMS','CLERK',7788,'12-JAN-83',1100,NULL,20
7566,'JONES','MANAGER',7839,'2-APR-81',2975,NULL,20
7698,'BLAKE','MANAGER',7839,'1-MAY-81',2850,NULL,30
7369,'SMITH','CLERK',7902,'17-DEC-80',800,NULL,20
7788,'SCOTT','ANALYST',7566,'09-DEC-82',3000,NULL,20
7654,'MARTIN','SALESMAN',7698,'28-SEP-81',1250,1400,30
7499,'ALLEN','SALESMAN',7698,'20-FEB-81',1600,300,30
7839,'KING-WEB','PRESIDENT',NULL,'17-NOV-81',15000,NULL,10
7782,'CLARK','MANAGER',7839,'9-JUN-81',2450,NULL,10
7844,'TURNER','SALESMAN',7698,'8-SEP-81',1500,0,30
7900,'JAMES','CLERK',7698,'3-DEC-81',950,NULL,30"""

### Converting string data to list to clean and remove duplicate values 

In [7]:
# Split the raw text into one record per line.
# `string` is rebound from a str to a list here, so the split is guarded:
# re-running this cell is then a no-op instead of raising AttributeError
# ('list' object has no attribute 'split').
if isinstance(string, str):
    string = string.split('\n')
print("Before:")
sorted(string)

Before:


["7369,'SMITH','CLERK',7902,'17-DEC-80',800,NULL,20",
 "7369,'SMITH','CLERK',7902,'17-DEC-80',800,NULL,20",
 "7499,'ALLEN','SALESMAN',7698,'20-FEB-81',1600,300,30",
 "7499,'ALLEN','SALESMAN',7698,'20-FEB-81',1600,300,30",
 "7521,'WARD','SALESMAN',7698,'22-FEB-81',1250,500,30",
 "7566,'JONES','MANAGER',7839,'2-APR-81',2975,NULL,20",
 "7654,'MARTIN','SALESMAN',7698,'28-SEP-81',1250,1400,30",
 "7654,'MARTIN','SALESMAN',7698,'28-SEP-81',1250,1400,30",
 "7698,'BLAKE','MANAGER',7839,'1-MAY-81',2850,NULL,30",
 "7698,'BLAKE','MANAGER',7839,'1-MAY-81',2850,NULL,30",
 "7782,'CLARK','MANAGER',7839,'9-JUN-81',2450,NULL,10",
 "7782,'CLARK','MANAGER',7839,'9-JUN-81',2450,NULL,10",
 "7788,'SCOTT','ANALYST',7566,'09-DEC-82',3000,NULL,20",
 "7788,'SCOTT','ANALYST',7566,'09-DEC-82',3000,NULL,20",
 "7839,'KING-WEB','PRESIDENT',NULL,'17-NOV-81',15000,NULL,10",
 "7844,'TURNER','SALESMAN',7698,'8-SEP-81',1500,0,30",
 "7844,'TURNER','SALESMAN',7698,'8-SEP-81',1500,0,30",
 "7876,'ADAMS','CLERK',7788,'12-JAN-8

### Removing duplicate data by converting sorted list to set, and back to list

In [8]:
# A set drops exact duplicate lines. Sorting *before* building the set is
# wasted work because a set has no order, so sort once afterwards instead;
# this also leaves `string` in a deterministic order.
string = sorted(set(string))
print("After")
string

After


["7369,'SMITH','CLERK',7902,'17-DEC-80',800,NULL,20",
 "7499,'ALLEN','SALESMAN',7698,'20-FEB-81',1600,300,30",
 "7521,'WARD','SALESMAN',7698,'22-FEB-81',1250,500,30",
 "7566,'JONES','MANAGER',7839,'2-APR-81',2975,NULL,20",
 "7654,'MARTIN','SALESMAN',7698,'28-SEP-81',1250,1400,30",
 "7698,'BLAKE','MANAGER',7839,'1-MAY-81',2850,NULL,30",
 "7782,'CLARK','MANAGER',7839,'9-JUN-81',2450,NULL,10",
 "7788,'SCOTT','ANALYST',7566,'09-DEC-82',3000,NULL,20",
 "7839,'KING-WEB','PRESIDENT',NULL,'17-NOV-81',15000,NULL,10",
 "7844,'TURNER','SALESMAN',7698,'8-SEP-81',1500,0,30",
 "7876,'ADAMS','CLERK',7788,'12-JAN-83',1100,NULL,20",
 "7900,'JAMES','CLERK',7698,'3-DEC-81',950,NULL,30",
 "7902,'FORD','ANALYST',7566,'3-DEC-81',3000,NULL,20",
 "7934,'MILLER','CLERK',7782,'23-JAN-82',1300,NULL,10"]

### Converting data to a proper dataframe, and further preprocessing.
Here the columns are 'no' (employee number), 'name', 'designation', 'managerid' (manager id), 'doj' (date of joining), 'sal' (salary), 'commission' and 'deptno' (department number). 
Further preprocessing involves removing unnecessary quotes. 
#### The cleaned data is stored in an emp_clean.csv file

In [10]:
import csv
import os

import pandas as pd

# One list per column, keyed in the same order as the fields of a record.
titles = {'no': [],'name': [],'designation': [],'managerid': [],'doj': [],'sal': [],'commission': [],'deptno': []}

# csv.reader with quotechar="'" removes the single quotes around text fields
# and keeps a comma inside a quoted field as part of that field. Splitting on
# ',' and slicing val[1:-1] by hand would break on such a comma and would
# mangle an unquoted value in a text column (e.g. NULL -> 'UL').
for row in csv.reader(sorted(string), quotechar="'"):
    if not row:
        # Blank input line: nothing to record.
        continue
    for title, val in zip(titles, row):
        titles[title].append(val)

# Every value is kept as a string (including the literal 'NULL'), which is
# what the later filtering cells compare against.
data = pd.DataFrame(titles)
data.to_csv(os.path.join(os.getcwd(), 'emp_clean.csv'), index=False)

### The data is grouped according to 'designation'.

In [11]:
# Group the employee rows by their 'designation' value.
group = data.groupby('designation')
# .groups maps each designation to the index labels of its rows.
print(group.groups)
# Show the rows belonging to a single group as a DataFrame.
group.get_group('PRESIDENT')

{'ANALYST': Int64Index([7, 12], dtype='int64'), 'CLERK': Int64Index([0, 10, 11, 13], dtype='int64'), 'MANAGER': Int64Index([3, 5, 6], dtype='int64'), 'PRESIDENT': Int64Index([8], dtype='int64'), 'SALESMAN': Int64Index([1, 2, 4, 9], dtype='int64')}


Unnamed: 0,no,name,designation,managerid,doj,sal,commission,deptno
8,7839,KING-WEB,PRESIDENT,,17-NOV-81,15000,,10


### The grouped data is pickled in an 'emps.dat' file

In [12]:
import pickle as pkl

# Serialize the groupby object itself (not just the DataFrame) to
# 'emps.dat' in the current working directory.
file = os.path.join(os.getcwd(), 'emps.dat')
with open(file, "wb") as f:
    pkl.dump(group, f)

### Loading the data from 'emps.dat' file
The next cell loads the pickled object back from the 'emps.dat' file; the cell after it inspects the unpickled data.

In [13]:
import pickle as pkl

# Open exactly the file written by the pickling cell. Scanning the directory
# for any name containing ".dat" could pick up an unrelated file, and left
# `file` as None (so open() failed with a TypeError) when nothing matched.
file = os.path.join(os.getcwd(), 'emps.dat')

# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook produced itself.
# The `with` block closes the file; no explicit f.close() is needed.
with open(file, "rb") as f:
    data_load = pkl.load(f)

In [14]:
# Repr of the unpickled object, followed by each group's row labels.
print(data_load, end="\n\n")
for designation, row_labels in data_load.groups.items():
    print(designation, row_labels)

print("\n\n")
# Print the rows of every group as its own frame, one blank line apart.
for designation in data_load.groups.keys():
    print(data_load.get_group(designation), end="\n\n")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc2f7ada430>

ANALYST Int64Index([7, 12], dtype='int64')
CLERK Int64Index([0, 10, 11, 13], dtype='int64')
MANAGER Int64Index([3, 5, 6], dtype='int64')
PRESIDENT Int64Index([8], dtype='int64')
SALESMAN Int64Index([1, 2, 4, 9], dtype='int64')



      no   name designation managerid        doj   sal commission deptno
7   7788  SCOTT     ANALYST      7566  09-DEC-82  3000       NULL     20
12  7902   FORD     ANALYST      7566   3-DEC-81  3000       NULL     20

      no    name designation managerid        doj   sal commission deptno
0   7369   SMITH       CLERK      7902  17-DEC-80   800       NULL     20
10  7876   ADAMS       CLERK      7788  12-JAN-83  1100       NULL     20
11  7900   JAMES       CLERK      7698   3-DEC-81   950       NULL     30
13  7934  MILLER       CLERK      7782  23-JAN-82  1300       NULL     10

     no   name designation managerid       doj   sal commission deptno
3  7566  JONES     MANAGER      783

### Zipping three arrays and finding the most frequent element
In this code, we zip 3 arrays together and find the most frequent element(s). The arrays are created using `randint()` function. 

In [15]:
from random import randint, seed
from collections import Counter
import numpy as np

# Fixed seed so that Restart & Run All reproduces the same three arrays.
seed(42)

# Three arrays of 15 random integers each, drawn from 1..20 inclusive.
random_list = [np.array([randint(1, 20) for _ in range(15)], dtype='int64') for _ in range(3)]
[a, b, c] = random_list

# Walk the three arrays in lockstep and tally every value of each triple.
freq = Counter()
for triple in zip(a, b, c):
    freq.update(triple)

for arr in random_list:
    print(sorted(arr))

# Printed as a plain dict: value -> number of occurrences across all arrays.
print(dict(freq))

# most_common() orders the (value, count) pairs by descending count, so the
# first pair carries the highest count; every value sharing that count is a
# most-frequent element.
freq = freq.most_common()
max_freq = freq[0][1] if freq else -1
freq_ele = [ele for ele, count in freq if count == max_freq]

freq_ele

[4, 4, 8, 8, 8, 9, 9, 14, 14, 17, 19, 19, 19, 20, 20]
[1, 2, 3, 5, 5, 5, 5, 5, 6, 14, 14, 18, 19, 20, 20]
[4, 5, 5, 6, 8, 8, 10, 10, 12, 14, 14, 16, 19, 19, 20]
{4: 3, 3: 1, 10: 2, 8: 5, 20: 5, 12: 1, 14: 6, 5: 7, 19: 6, 17: 1, 6: 2, 9: 2, 1: 1, 18: 1, 2: 1, 16: 1}


[5]

### List Comprehension
1. Creating a list of all the employees without a commission (commission is NULL)
2. Creating a list of all the employees in department number 10 or 20

In [16]:
# Display the employee DataFrame that the list comprehensions below filter.
data

Unnamed: 0,no,name,designation,managerid,doj,sal,commission,deptno
0,7369,SMITH,CLERK,7902.0,17-DEC-80,800,,20
1,7499,ALLEN,SALESMAN,7698.0,20-FEB-81,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,2-APR-81,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,1-MAY-81,2850,,30
6,7782,CLARK,MANAGER,7839.0,9-JUN-81,2450,,10
7,7788,SCOTT,ANALYST,7566.0,09-DEC-82,3000,,20
8,7839,KING-WEB,PRESIDENT,,17-NOV-81,15000,,10
9,7844,TURNER,SALESMAN,7698.0,8-SEP-81,1500,0.0,30


In [17]:
# Rows (as plain lists) of every employee whose commission is the literal
# string 'NULL'. itertuples() walks the rows directly, so this does not depend
# on the index labels being 0..n-1 the way data[col][i] does with an
# enumerate() position, and it avoids one Series lookup per cell.
li = [list(row) for row in data.itertuples(index=False) if row.commission == 'NULL']
li

[['7369', 'SMITH', 'CLERK', '7902', '17-DEC-80', '800', 'NULL', '20'],
 ['7566', 'JONES', 'MANAGER', '7839', '2-APR-81', '2975', 'NULL', '20'],
 ['7698', 'BLAKE', 'MANAGER', '7839', '1-MAY-81', '2850', 'NULL', '30'],
 ['7782', 'CLARK', 'MANAGER', '7839', '9-JUN-81', '2450', 'NULL', '10'],
 ['7788', 'SCOTT', 'ANALYST', '7566', '09-DEC-82', '3000', 'NULL', '20'],
 ['7839', 'KING-WEB', 'PRESIDENT', 'NULL', '17-NOV-81', '15000', 'NULL', '10'],
 ['7876', 'ADAMS', 'CLERK', '7788', '12-JAN-83', '1100', 'NULL', '20'],
 ['7900', 'JAMES', 'CLERK', '7698', '3-DEC-81', '950', 'NULL', '30'],
 ['7902', 'FORD', 'ANALYST', '7566', '3-DEC-81', '3000', 'NULL', '20'],
 ['7934', 'MILLER', 'CLERK', '7782', '23-JAN-82', '1300', 'NULL', '10']]

In [18]:
# Rows (as plain lists) of every employee in department '10' or '20'; deptno
# values are strings in this frame. itertuples() walks the rows directly, so
# this does not depend on the index labels being 0..n-1 the way data[col][i]
# does with an enumerate() position.
li2 = [list(row) for row in data.itertuples(index=False) if row.deptno in ('10', '20')]
li2

[['7369', 'SMITH', 'CLERK', '7902', '17-DEC-80', '800', 'NULL', '20'],
 ['7566', 'JONES', 'MANAGER', '7839', '2-APR-81', '2975', 'NULL', '20'],
 ['7782', 'CLARK', 'MANAGER', '7839', '9-JUN-81', '2450', 'NULL', '10'],
 ['7788', 'SCOTT', 'ANALYST', '7566', '09-DEC-82', '3000', 'NULL', '20'],
 ['7839', 'KING-WEB', 'PRESIDENT', 'NULL', '17-NOV-81', '15000', 'NULL', '10'],
 ['7876', 'ADAMS', 'CLERK', '7788', '12-JAN-83', '1100', 'NULL', '20'],
 ['7902', 'FORD', 'ANALYST', '7566', '3-DEC-81', '3000', 'NULL', '20'],
 ['7934', 'MILLER', 'CLERK', '7782', '23-JAN-82', '1300', 'NULL', '10']]

$$
\sigma = \sqrt{\frac{\Sigma(X - \mu)^2}{n}}
$$

where,  
$\sigma$ = population standard deviation  
$\Sigma$ = sum over all values $X$  
$\mu$ = population mean  
$n$ = number of values in the population

In [19]:
from urllib import request
import json
from IPython.display import Image

# NOTE(review): confirm this endpoint is still being served before relying on it.
url = "https://restcountries.eu/rest/v2/all"

# The context manager closes the HTTP response once the JSON is parsed, and
# the timeout stops the cell from hanging forever on an unresponsive host.
with request.urlopen(url, timeout=30) as resp:
    script = json.load(resp)

# Take the 'flag' URL of the first entry named 'India'. next(..., None)
# replaces indexing [0] into a possibly empty list, so a missing entry is
# reported with a clear message.
Indian_flag = next((country['flag'] for country in script if country['name'] == 'India'), None)
if Indian_flag is None:
    raise LookupError("No country named 'India' in the response from " + url)
Image(url=Indian_flag)

In [20]:
# Show the flag image URL extracted from the REST response.
Indian_flag

'https://restcountries.eu/data/ind.svg'