# Scraping AJAX 
A practice exercise in scraping information from the web page https://corp.sos.ms.gov/corp/portal/c/page/corpBusinessIdSearch/portal.aspx?#clear=1

## AJAX request
1. Open the page above on Chrome. Then open View -> Developer -> Developer Tools.
2. In developer tools, click Network.
3. Search something pretty simple on original page (for example: "a").
4. In developer tools, we will see a file called "BusinessNameSearch", click this file.
5. In the General section, there is a Request URL. This is the URL we will use later with Python `requests`.
6. In Request Payload, we will see the request format, which we will reproduce with Python `requests`.

## Simple Version for Diagnostic
This is the simplest possible code, only to check whether we get a response at all.

In [9]:
import requests
import json

# Search term: the service is asked for business names starting with this text.
temp = "a"

# Send the search as a JSON POST, the format described under "Request Payload"
# above. The timeout keeps a stalled connection from hanging the notebook
# forever (requests waits indefinitely when no timeout is given).
r = requests.post(
    url='https://corp.sos.ms.gov/corp/Services/MS/CorpServices.asmx/BusinessNameSearch',
    json={
        'BusinessName': temp,
        'SearchType': "startingwith"},
    timeout=30,
)

# Fail here with a clear HTTP error instead of later while decoding the body.
# `r` is already bound at this point, so it can still be inspected afterwards.
r.raise_for_status()

If everything works well, we can type 
```python
r.json()
```
and see the list of search results.

Then we convert the result from a str into a pandas DataFrame.

In [10]:
import ast # eval function
import pandas as pd

# The decoded response keeps the search results under the key "d" as one string.
res_str = r.json()["d"]

# Drop the first 11 characters and the last character, then parse what is left
# as a Python literal.
# NOTE(review): presumably the slice removes a fixed-length wrapper around the
# record list -- confirm against a live response.
res = ast.literal_eval(res_str[11:-1])  # list of dict

# One row per result record, one column per record key.
df = pd.DataFrame(res)

## Iterating over starting letters A-Z

In [11]:
import requests
import json
import ast
import pandas as pd

# Empty frame that fixes the column set (and order) of the final table.
output = pd.DataFrame(columns = ['BusinessFormedDate', 'BusinessId', 'BusinessName', 'EntityId',
       'FilingId', 'FilingStatus', 'FilingTypeId', 'FilingtypeName',
       'NameType'])

# Per-letter result frames are gathered here and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all earlier rows on every iteration.
frames = [output]

for i in range(ord('a'), ord('z')+1):
    temp = chr(i)

    r = requests.post(
        url='https://corp.sos.ms.gov/corp/Services/MS/CorpServices.asmx/BusinessNameSearch',
        json = {
            'BusinessName': temp,
            'SearchType': "startingwith"},
        timeout=30,  # do not hang forever on a stalled connection
    )

    res_str = r.json()["d"]
    if res_str == '""':
        # No matches for this letter; slicing this payload would leave nothing
        # for literal_eval to parse.
        continue
    res = ast.literal_eval(res_str[11:-1])  # list of dict
    df = pd.DataFrame(res)
    frames.append(df)

output = pd.concat(frames, ignore_index=True)

output.to_csv("MS_Business_info.csv", index=False)

## references
http://toddhayton.com/2015/03/11/scraping-ajax-pages-with-python/

## Issues
1. The web search supports neither regular expressions nor spaces; some letters must be typed in order to search.
2. Each request returns at most 2000 results.
3. The search results contain no details.
4. Search results are duplicated between a business's former (used) name and its current name.
5. Agent names have an inconsistent format: "first last" or "last, first".
6. Address formats are inconsistent.

## Recursive
A recursive function that searches all strings starting with the given prefix.
If the number of result rows reaches 2000 (the maximum returned per request), the current string is used as the prefix for further searches.

In [12]:
import requests
import json
import ast
import pandas as pd

class scrap:
    """Collect business-name search results for every name under a prefix.

    ``foo(prefix)`` extends the prefix by one character at a time (a-z, then
    0-9) and queries the search service for each extended prefix. When a query
    comes back with the maximum number of rows, the extended prefix is searched
    recursively instead of being stored; otherwise its rows are added to
    ``output``. ``res()`` returns the accumulated DataFrame.
    """

    # Row count at which a result page is treated as truncated.
    RESULT_CAP = 2000

    def __init__(self):
        # Accumulated results; starts empty with the nine result columns.
        self.output = pd.DataFrame(columns = ['BusinessFormedDate', 'BusinessId', 'BusinessName', 'EntityId',
                                              'FilingId', 'FilingStatus', 'FilingTypeId', 'FilingtypeName',
                                              'NameType'])

        # Characters appended to a prefix: 'a'..'z' followed by '0'..'9'.
        self.letters = list(map(chr, range(ord("a"), ord("z")+1))) + list(map(str,range(10)))

    def _search(self, prefix):
        """Return the list of result records for names starting with ``prefix``.

        Returns an empty list when the service reports no matches.
        """
        r = requests.post(
            url='https://corp.sos.ms.gov/corp/Services/MS/CorpServices.asmx/BusinessNameSearch',
            json = {
                'BusinessName': prefix,
                'SearchType': "startingwith"},
            timeout=30,  # do not hang forever on a stalled connection
        )
        res_str = r.json()["d"]
        if res_str == '""':
            return []
        # Drop the first 11 characters and the last one, then parse the rest as
        # a Python literal.
        # NOTE(review): presumably a fixed-length wrapper around the record
        # list -- confirm against a live response.
        return ast.literal_eval(res_str[11:-1])

    def foo(self,pre):
        """Search ``pre`` + each character in ``self.letters``, recursing on full pages.

        Prints every prefix that is queried. Rows from a page that hit the cap
        are not stored; only the rows found under its longer prefixes are.
        """
        for letter in self.letters:
            inpt = pre+letter
            print(inpt)
            rows = self._search(inpt)
            if not rows:
                continue
            df = pd.DataFrame(rows)
            if df.shape[0] >= self.RESULT_CAP:
                # Page is full, so results were probably cut off: narrow the prefix.
                self.foo(inpt)
            else:
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported equivalent and keeps the same row labels.
                self.output = pd.concat([self.output, df])

    def res(self):
        """Return the DataFrame of all rows collected so far."""
        return(self.output)

### Example

In [13]:
# Build a scraper and search every one-character extension of the prefix "ama"
# (each queried prefix is printed below).
a = scrap()
a.foo("ama")
# Accumulated result rows as a DataFrame.
df = a.res()
# Shown as the cell result: (number of rows, number of columns).
df.shape

amaa
amab
amac
amad
amae
amaf
amag
amah
amai
amaj
amak
amal
amam
aman
amao
amap
amaq
amar
amas
amat
amau
amav
amaw
amax
amay
amaz
ama0
ama1
ama2
ama3
ama4
ama5
ama6
ama7
ama8
ama9


(344, 9)

# ASPX Scraping

In [17]:
import requests
import json
import ast
import pandas as pd
from bs4 import BeautifulSoup

In [18]:
def table_dt(bs_ele):
    """Split one name/address table cell into six text fields.

    The cell's ``contents`` are read as up to three parts in order: a name
    node (its ``.text`` is used), a "street, suite" string, and a
    "city, state zip" string. A part is treated as missing when the node at
    its position renders as ``<br/>`` or when the cell has no node there.

    Args:
        bs_ele: an element exposing ``.contents``.
            NOTE(review): presumably a BeautifulSoup ``<td>`` tag -- the
            callers pass results of ``find_all("td")``.

    Returns:
        A list of exactly six strings:
        ``[name, street, suite, city, state, zip]``; missing parts are ``""``.
    """
    info = bs_ele.contents

    def is_missing(pos):
        # A part is absent if the cell ends early or a <br/> sits in its place.
        return pos >= len(info) or str(info[pos]) == "<br/>"

    res = []
    i = 0

    # --- name ---
    if is_missing(i):
        res.append("")
    else:
        # Collapse runs of whitespace inside the name.
        res.append(" ".join(info[i].text.split()))
        i += 1  # step onto the <br/> that follows the name
    i += 1

    # --- street, suite ---
    if is_missing(i):
        res.extend([''] * 2)
    else:
        parts = info[i].split(",")
        if len(parts) == 1:
            # No comma: the whole text is the street, no suite.
            parts.append("")
        elif len(parts) >= 3:
            # More than one comma cannot be split reliably; keep the whole
            # text as the street. Uses the current position rather than a
            # fixed index, so it is still right when the name is missing.
            parts = [str(info[i]), ""]
        # Strip in every case so "street, suite" does not keep the space
        # after the comma.
        res.extend(part.strip() for part in parts)
        i += 1  # step onto the <br/> that follows the street
    i += 1

    # --- city, state zip ---
    if is_missing(i):
        res.extend([''] * 3)
    else:
        tokens = info[i].replace(",", "").split()
        if len(tokens) > 3:
            # Multi-word city: everything before the last two tokens.
            tokens[:-2] = [" ".join(tokens[:-2])]
        # Pad short lines so every row has six fields; otherwise values
        # appended after this row by callers would land in the wrong column.
        tokens.extend([""] * (3 - len(tokens)))
        res.extend(tokens)
    return(res)

In [33]:
# Rows collected for the four output tables, one list per detail sub-table.
Name_History = []
Business_Information = []
Registered_Agent = []
Officers_Directors = []

seen = set()   # business ids already requested (each id is fetched once)
error = set()  # business ids whose page failed to download or had an unexpected cell count
for (ind,row) in df.iterrows():
# for (ind,row) in df.iloc[:50,:].iterrows():
    # Look the fields up by column label; positional integer indexing on a
    # label-indexed Series is deprecated in pandas.
    Id = row["BusinessId"]
    print(Id)
    if Id in seen: continue
    seen.add(Id)
    address = ('https://corp.sos.ms.gov/corp/portal/c/page/'
               'corpbusinessidsearch/~/ViewXSLTFileByName.aspx?'
               'providerName=MSBSD_CorporationBusinessDetails&FilingId=')+ str(row["FilingId"]) +'&_='

    try:
        # The timeout keeps one stalled request from hanging the whole run.
        page = requests.get(address, timeout=30)
        page.raise_for_status()
    except requests.RequestException:
        # Record the failure and move on so rows collected so far are kept.
        error.add(Id)
        continue
    soup = BeautifulSoup(page.text, 'html.parser')
    # The fourth child of <body> holds the detail sub-tables.
    # NOTE: this raises if <body> is absent or has fewer than four children.
    tables = [i for i in soup.body][3]
    table = [i for i in tables]
    n = len(table)

    # table[1] is the title of the first subtable
    # table[3] is the body of the first subtable
    #
    # table[5] is the title of the second subtable
    # table[7] is the body of the second subtable
    #
    # table[9] is the title of the third subtable
    # table[11] is the body of the third subtable
    #
    # table[13] is the title of the fourth subtable
    # table[15] is the body of the fourth subtable

    if n>3:
        # table[3] is Name History
        tb3 = [i.text for i in table[3].find_all("td")]
        n3 = len(tb3)
    else:
        n3 = 0

    if n>7:
        # table[7] is Business Information
        tb7 = [i.text for i in table[7].find_all("td")]
        n7 = len(tb7)
    else:
        n7 = 0

    if n>11:
        # table[11] is Registered Agent (cells kept as elements for table_dt)
        tb11 = [i for i in table[11].find_all("td")]
        n11 = len(tb11)
    else:
        n11 = 0

    if n>15:
        # table[15] is Officers & Directors (cells kept as elements for table_dt)
        tb15 = [i for i in table[15].find_all("td")]
        n15 = len(tb15)
    else:
        n15 = 0

    # Skip pages whose cell counts do not match the expected row widths.
    if n3%3!=0 or n7%6!=0 or n11%2!=0 or n15%3!=0:
        error.add(Id)
        continue

    if n3==0:
        Name_History.extend([[Id,"",""]])
    else:
        # Skip the first three cells, then keep the 1st and 3rd cell of each
        # following group of three.
        tb3 = [[tb3[i],tb3[i+2]] for i in range(3,n3,3)]
        tb3 = [[Id]+i for i in tb3]
        Name_History.extend(tb3)

    if n7==0:
        Business_Information.append(['',Id] + ['']*4)
    else:
        # Keep every second cell, starting from the second one.
        tb7 = [tb7[i] for i in range(1,n7,2)]
        # Collapse whitespace in the last kept cell.
        tb7[-1] = ' '.join(tb7[-1].split())
        Business_Information.append(tb7)

    if n11==0:
        Registered_Agent.extend([[Id]+[""]*6])
    else:
        # Every cell after the first is parsed as a name/address cell.
        text11 = list(map(table_dt, [tb11[j] for j in range(1,n11)]))
        text11 = [[Id]+i for i in text11]
        Registered_Agent.extend(text11)

    if n15==0:
        Officers_Directors.extend([[Id]+[""]*7])
    else:
        # Skip the first three cells; in each following group of three the
        # 1st cell is the name/address and the 3rd is the title.
        text15 = list(map(table_dt, [tb15[j] for j in range(3,n15,3)]))
        title15 = [[tb15[j].text.replace("\xa0","")] for j in range(5,n15,3)]
        text15 = [[Id]+i+j for (i,j) in zip(text15,title15)]
        Officers_Directors.extend(text15)
    print("succ")

out1 = pd.DataFrame(Name_History,
                    columns=["Business_ID","Name","Name_Type"])
out2 = pd.DataFrame(Business_Information,
                    columns=["Business_Type","Business_ID","Status","Effective_Date",
                             "State_of_Incorporation", "Principal_Office_Address"])
out3 = pd.DataFrame(Registered_Agent,
                    columns=["Business_ID","Name","Street","Apt/Suite","City","State","Zip_Code"])
out4 = pd.DataFrame(Officers_Directors,
                    columns=["Business_ID", "Name","Street","Apt/Suite","City","State","Zip_Code","Title"])

# out1.to_csv('file path')

1128825
succ
870295
succ
968740
succ
619958
succ
734087
succ
864920
succ
1038751
succ
1135606
succ
646889
succ
1180103
succ
433489
succ
569622
succ
996941
succ
1024977
succ
1023866
succ
530731
succ
584323
succ
1146222
succ
526262
succ
909586
succ
721431
succ
899418
succ
721431
1025627
succ
657178
succ
706109
succ
891887
succ
998819
succ
998821
succ
1139438
succ
1166614
succ
920901
succ
1120559
succ
994554
succ
1117337
succ
980466
succ
1017358
succ
600894
succ
1012927
succ
1002867
succ
944175
succ
1016370
succ
1166120
succ
1062935
succ
1159568
succ
1028404
succ
553937
succ
964361
succ
433490
succ
307388
succ
885765
succ
968685
succ
985861
succ
998822
succ
679700
succ
993145
succ
928395
succ
969229
succ
920961
succ
920961
1166564
succ
641078
succ
1178619
succ
920961
915227
succ
957385
succ
618105
succ
1144351
succ
954544
succ
1081065
succ
1055819
succ
954159
succ
1006259
succ
1038446
succ
993602
succ
963341
succ
1146154
succ
1144694
succ
1189521
succ
1192511
succ
1061645
succ
638598
succ