# DCPP Project : Indian Artists [Group 26]  Phase 1

In [509]:
import pandas as pd
 
# Team roster: one entry per member, names and PG IDs kept in matching order.
member_names = ['Karan Saket', 'Pooja Ayyalasomayajula', 'Vikram Tantri', 'Shashank Jha']
member_pgids = [12110063, 12110099, 12110114, 12110023]

team = {'Name': member_names, 'PGID': member_pgids}
team_df = pd.DataFrame(data=team)
team_df

Unnamed: 0,Name,PGID
0,Karan Saket,12110063
1,Pooja Ayyalasomayajula,12110099
2,Vikram Tantri,12110114
3,Shashank Jha,12110023


### Step 1: Getting data from the site 

###### For the chosen topic, Indian artists, a list of the artists is available on the Wikipedia page below; however, the detailed attributes of each artist have to be obtained from the sub-links attached to that main page. Phase 1 covers obtaining this first-level list from the wiki page.

In [501]:
import requests

#Assigning the site url
# (alternative source considered: 'https://commons.wikimedia.org/wiki/Category:Art_of_India')
site_url = 'https://en.wikipedia.org/wiki/List_of_Indian_artists'


#Checking if the site is receiving a response
# A timeout is passed so the cell fails instead of hanging forever when the server does not answer
response = requests.get(site_url, timeout=30)
response.status_code

200

In [502]:
#Loading page contents into a text format
# The slice below only previews the first 1000 characters; page_contents keeps the full text
page_contents = response.text
page_contents[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of Indian artists - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"2ca52fc0-c2b1-4791-b075-c9edf698b374","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Indian_artists","wgTitle":"List of Indian artists","wgCurRevisionId":1051065619,"wgRevisionId":1051065619,"wgArticleId":4457757,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Lists of Indian artists"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"List_of_Indian_artists","

### Step 2: Beautiful soup parsing and data collection

In [503]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

#function to get the text data from the site 
def getTextData(url, timeout=30):
    """Download a page and return its main content container.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bs4 element or None
        The element whose id is "mw-content-text", or None when the request
        was not successful or the page contains no such element.
    """
    response = requests.get(url, timeout=timeout)
    # An error response (4xx/5xx) is not the page we asked for, so do not parse it
    if not response.ok:
        return None
    doc = BeautifulSoup(response.text, 'html.parser')
    # find() already returns None when nothing matches, so a single lookup is enough
    return doc.find(id="mw-content-text")
    
#function to get title, URL and description data from a given URL
def getData(doc):
    """Collect the title, URL and text of every linked list item in ``doc``.

    Parameters
    ----------
    doc : parsed HTML element or None
        Object exposing ``find_all``; typically the value returned by
        ``getTextData``.

    Returns
    -------
    dict or None
        ``{'localNames': [...], 'localUrl': [...], 'localDesc': [...]}`` with
        one entry per ``<li>`` whose first anchor has an ``href``. Returns
        None when ``doc`` is empty/None.
    """
    if not doc:
        return None

    localNames = []
    localUrl = []
    localDesc = []
    for li in doc.find_all('li'):                        #Every list item on the page is a candidate row
        a1 = li.find('a')                                #First anchor of the item carries the title and the link
        # A list item may contain no anchor at all (find() returns None),
        # or an anchor without a link target; neither yields a usable row.
        if a1 is None or 'href' not in a1.attrs:
            continue
        localUrl.append('https://en.wikipedia.org' + a1.get('href'))
        localNames.append(a1.get('title'))               #None when the anchor has no title attribute
        localDesc.append(li.text)                        #Full visible text of the list item
    return {'localNames': localNames, 'localUrl': localUrl, 'localDesc': localDesc}

#function to make a dataframe
def makedf(dict):
    """Build and return a pandas DataFrame from the given mapping of columns."""
    return pd.DataFrame(data=dict)

#URL of the Wikipedia list page to scrape (despite its name, site_id holds a full URL, not an id)
site_id = "https://en.wikipedia.org/wiki/List_of_Indian_artists"
site_id

'https://en.wikipedia.org/wiki/List_of_Indian_artists'

### Step 3: Making dataframe of the collected data

In [504]:
#Pipeline, one step per line:
#  1. getTextData -> content of the list page
#  2. getData     -> name, URL and description of each list entry
#  3. makedf      -> dataframe ready for cleaning
#The wall-clock duration of the three steps is kept in total_time (seconds, 2 decimals)
start_time = time.time()

content_doc = getTextData(site_id)
scraped_columns = getData(content_doc)
df = makedf(scraped_columns)

end_time = time.time()
total_time = round(end_time - start_time, 2)

df
# print(str(total_time)+" seconds elapsed fetching this data")

Unnamed: 0,localNames,localUrl,localDesc
0,Amrita Sher-Gil,https://en.wikipedia.org/wiki/Amrita_Sher-Gil,Amrita Sher-Gil (1913–1941)
1,Jamini Roy,https://en.wikipedia.org/wiki/Jamini_Roy,Jamini Roy (1887–1972)
2,Kalipada Ghoshal,https://en.wikipedia.org/wiki/Kalipada_Ghoshal,Kalipada Ghoshal (1906–1995)
3,Maniam,https://en.wikipedia.org/wiki/Maniam,Maniam (1924–1968)
4,Manishi Dey,https://en.wikipedia.org/wiki/Manishi_Dey,Manishi Dey (1909–1966)
...,...,...,...
198,List of artists from Ukraine,https://en.wikipedia.org/wiki/List_of_artists_...,Ukraine
199,List of artists from the United Kingdom,https://en.wikipedia.org/wiki/List_of_artists_...,United Kingdom
200,List of artists from the United States,https://en.wikipedia.org/wiki/List_of_artists_...,United States
201,List of artists from Uruguay,https://en.wikipedia.org/wiki/List_of_artists_...,Uruguay


### Step 4: Data Cleansing to get the target data

In [505]:
#Cleaning of data
#Rows with a missing value (or the literal string 'None') in any column are discarded first
complete_rows = df.replace(to_replace='None', value=np.nan).dropna()

#Navigation rows: links to other "List of ..." pages and to "Template" pages
is_list_link = complete_rows.localNames.str.contains("List of")
is_template_link = complete_rows.localNames.str.contains("Template")

df2 = complete_rows[~(is_list_link | is_template_link)]
df2

Unnamed: 0,localNames,localUrl,localDesc
0,Amrita Sher-Gil,https://en.wikipedia.org/wiki/Amrita_Sher-Gil,Amrita Sher-Gil (1913–1941)
1,Jamini Roy,https://en.wikipedia.org/wiki/Jamini_Roy,Jamini Roy (1887–1972)
2,Kalipada Ghoshal,https://en.wikipedia.org/wiki/Kalipada_Ghoshal,Kalipada Ghoshal (1906–1995)
3,Maniam,https://en.wikipedia.org/wiki/Maniam,Maniam (1924–1968)
4,Manishi Dey,https://en.wikipedia.org/wiki/Manishi_Dey,Manishi Dey (1909–1966)
...,...,...,...
92,Vasudeo S. Gaitonde,https://en.wikipedia.org/wiki/Vasudeo_S._Gaitonde,"Vasudeo S. Gaitonde (1924–2001), painter"
93,Vivan Sundaram,https://en.wikipedia.org/wiki/Vivan_Sundaram,"Vivan Sundaram (born 1943), installation artist"
94,Yantr,https://en.wikipedia.org/wiki/Yantr,Yantr
95,Sudarsan Pattnaik,https://en.wikipedia.org/wiki/Sudarsan_Pattnaik,Sudarsan Pattnaik (born 1977) Sand Artist


In [506]:
#Extracting year information and description from the localDesc column
# Work on a copy: df2 stays exactly as the previous cell displayed it and this cell can be re-run safely
df3 = df2.copy()

# Text after the first '(' of the description, e.g. "1924–2001), painter"
df3['localDesc2'] = df3['localDesc'].str.split(pat='(', expand=True)[1]

# Split that text at ')': piece 0 is the year text, piece 1 is whatever follows the parenthesis
paren_parts = df3['localDesc2'].str.split(pat=')', expand=True)
df3['Year'] = paren_parts[0]
df3['localDesc3'] = paren_parts[1]

# Drop the leading separator character (',' or ' '), then any whitespace left around the occupation
df3['ArtistOccupation'] = df3['localDesc3'].str[1:].str.strip()

# Year text is "<born>–<died>", "<born>-<died>" or "born <year>"; split on dash, hyphen or comma
year_parts = df3['Year'].str.split('–|-|,', expand=True)
df3['YearBorn'] = year_parts[0]
df3['YearDied'] = year_parts[1]

# Remove the stray word 'born' in any capitalisation and the blank it leaves behind.
# regex=True is passed explicitly because case=False is only valid for regex replacement
# and the default of `regex` differs between pandas versions.
for year_col in ['YearBorn', 'YearDied']:
    df3[year_col] = df3[year_col].str.replace('born', '', case=False, regex=True).str.strip()
df3

Unnamed: 0,localNames,localUrl,localDesc,localDesc2,Year,localDesc3,ArtistOccupation,YearBorn,YearDied
0,Amrita Sher-Gil,https://en.wikipedia.org/wiki/Amrita_Sher-Gil,Amrita Sher-Gil (1913–1941),1913–1941),1913–1941,,,1913,1941
1,Jamini Roy,https://en.wikipedia.org/wiki/Jamini_Roy,Jamini Roy (1887–1972),1887–1972),1887–1972,,,1887,1972
2,Kalipada Ghoshal,https://en.wikipedia.org/wiki/Kalipada_Ghoshal,Kalipada Ghoshal (1906–1995),1906–1995),1906–1995,,,1906,1995
3,Maniam,https://en.wikipedia.org/wiki/Maniam,Maniam (1924–1968),1924–1968),1924–1968,,,1924,1968
4,Manishi Dey,https://en.wikipedia.org/wiki/Manishi_Dey,Manishi Dey (1909–1966),1909–1966),1909–1966,,,1909,1966
...,...,...,...,...,...,...,...,...,...
92,Vasudeo S. Gaitonde,https://en.wikipedia.org/wiki/Vasudeo_S._Gaitonde,"Vasudeo S. Gaitonde (1924–2001), painter","1924–2001), painter",1924–2001,", painter",painter,1924,2001
93,Vivan Sundaram,https://en.wikipedia.org/wiki/Vivan_Sundaram,"Vivan Sundaram (born 1943), installation artist","born 1943), installation artist",born 1943,", installation artist",installation artist,1943,
94,Yantr,https://en.wikipedia.org/wiki/Yantr,Yantr,,,,,,
95,Sudarsan Pattnaik,https://en.wikipedia.org/wiki/Sudarsan_Pattnaik,Sudarsan Pattnaik (born 1977) Sand Artist,born 1977) Sand Artist,born 1977,Sand Artist,Sand Artist,1977,


In [507]:
#Keep only the final columns and give them their presentation names
df4 = (
    df3
    .drop(columns=['localDesc', 'localDesc2', 'Year', 'localDesc3'])
    .rename(columns={'localNames': 'ArtistName', 'localUrl': 'ArtistWikiPage'})
)
df4

Unnamed: 0,ArtistName,ArtistWikiPage,ArtistOccupation,YearBorn,YearDied
0,Amrita Sher-Gil,https://en.wikipedia.org/wiki/Amrita_Sher-Gil,,1913,1941
1,Jamini Roy,https://en.wikipedia.org/wiki/Jamini_Roy,,1887,1972
2,Kalipada Ghoshal,https://en.wikipedia.org/wiki/Kalipada_Ghoshal,,1906,1995
3,Maniam,https://en.wikipedia.org/wiki/Maniam,,1924,1968
4,Manishi Dey,https://en.wikipedia.org/wiki/Manishi_Dey,,1909,1966
...,...,...,...,...,...
92,Vasudeo S. Gaitonde,https://en.wikipedia.org/wiki/Vasudeo_S._Gaitonde,painter,1924,2001
93,Vivan Sundaram,https://en.wikipedia.org/wiki/Vivan_Sundaram,installation artist,1943,
94,Yantr,https://en.wikipedia.org/wiki/Yantr,,,
95,Sudarsan Pattnaik,https://en.wikipedia.org/wiki/Sudarsan_Pattnaik,Sand Artist,1977,


In [500]:
#Extracting to csv
# index=False leaves the dataframe's row index out of the written file
df4.to_csv('final_list_of_artists.csv',index=False)