In [1]:
#importing all necessary libraries

from bs4 import BeautifulSoup
import requests
from math import ceil
import time
import csv
import pandas as pd
import os

#=======================================================================================================

# Internshala listing pages to crawl, one entry per location.
# get_links_internshala labels rows by position in this list, so the order matters.
links_list = [
    'https://internshala.com/internships/work-from-home-jobs',
    'https://internshala.com/internships/internship-in-bangalore',
    'https://internshala.com/internships/internship-in-hyderabad',
    'https://internshala.com/internships/internship-in-odisha',
]

#=======================================================================================================


def scrape_main(link, timeout=30):
    '''
     Download a web page and return it parsed as a BeautifulSoup tree.

     link    : URL of the page to request.
     timeout : seconds to wait for the server before giving up (default 30).
               requests has no timeout of its own, so without this a stalled
               connection would block the whole scrape forever.

     Returns a BeautifulSoup object built from the response text with the lxml parser.
     The HTTP status code is not checked, so an error page is parsed like any other page.

     Raises whatever requests.get raises (for example a timeout or connection error).

     lxml is a separate package; if the parser is missing install it by: " pip install lxml "
    '''

    response = requests.get(link, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

#=================================================================================================================

# Name of the first internshala csv (job links): fixed prefix, then the time this
# line ran formatted as day_month_year_hour_minute_second, then the .csv extension.
file_name_1 = "".join(['internshala_first_data_', time.strftime("%d_%m_%Y_%H_%M_%S"), ".csv"])


#============================================================================================================

def _internshala_job_links(url, first_page, pages):
    '''
        Lazily yield every distinct job link found on the listing pages of one url.

        first_page is the already parsed landing page and is used as page 0, the
        remaining pages are requested one at a time as "<url>/page-<n>" only when
        the caller keeps iterating.
    '''
    seen = set()
    for page in range(pages):
        # NOTE(review): page numbers stay 0-based as before; confirm against the
        # site's URL scheme that the last page is not "/page-<pages>".
        page_soup = first_page if page == 0 else scrape_main(url + "/page-" + str(page))

        for single_job in page_soup.find_all("div", {"class": "individual_internship"}):
            profile = single_job.find('div', {'class': 'heading_4_5 profile'})
            anchor = profile.a if profile is not None else None
            href = anchor.get('href') if anchor is not None else None
            # cards without a usable link are skipped instead of crashing the run
            if not href:
                continue

            job_link = "https://internshala.com" + href
            if job_link not in seen:
                seen.add(job_link)
                yield job_link


def get_links_internshala(links = links_list, max_jobs_per_link = 1):
    '''
        Function to extract unique links of job postings in internshala and
        store them in the csv file named by file_name_1.

        links             : listing urls to crawl. The first four are labelled
                            work_from_home / bangalore / hyderabad / odisha by position,
                            any further url is labelled with its last path segment.
        max_jobs_per_link : how many job links to keep per listing url. The default 1
                            matches the earlier behaviour, where the loops stopped after
                            the first posting; pass None to walk through every page.

        Columns written: source, location, job_link.
        A listing whose job count cannot be read is reported and skipped.
    '''
    from itertools import islice

    # generalied locations based on links and its position
    loc_links = ['work_from_home', 'bangalore', 'hyderabad', 'odisha']

    # writing row heading to understand each column
    row_heading = ['source', 'location', 'job_link']

    # newline='' is what the csv module expects (no blank rows on Windows) and
    # utf-8 matches what pandas assumes when the file is read back below
    with open(file_name_1, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # initially writing header of csv file
        writer.writerow(row_heading)

        for index, url in enumerate(links):
            if index < len(loc_links):
                location = loc_links[index]
            else:
                location = url.rstrip('/').rsplit('/', 1)[-1]

            # the landing page carries the total job count in its heading
            soup = scrape_main(url)
            heading = soup.find('div', {'class': 'heading heading_4_6'})
            try:
                job_count = int(heading.text.split()[0])
            except (AttributeError, IndexError, ValueError):
                print("\tcould not read the job count for " + url + " - skipping it")
                continue

            # the page count is derived from the job count, 40 postings per page
            pages = ceil(job_count / 40)

            # islice stops pulling from the generator once the limit is reached,
            # so no further pages are requested than needed
            for job_link in islice(_internshala_job_links(url, soup, pages), max_jobs_per_link):
                writer.writerow(['internshala', location, job_link])

    # pandas re-writes the file so its final layout is the same as before this
    # change; to_csv overwrites the file, so no separate os.remove is needed
    df = pd.read_csv(file_name_1)
    df.to_csv(file_name_1, index=False)

#=================================================================================================================

# Name of the second internshala csv (full job details): fixed prefix, then the time
# this line ran formatted as day_month_year_hour_minute_second, then the .csv extension.
file_name_2 = "".join(['internshala_second_data_', time.strftime("%d_%m_%Y_%H_%M_%S"), ".csv"])


#===============================================================================================================

def get_complete_info_internshala(file_name = file_name_1):
    '''
        A function in internshala scraping to extract all the information about each job posting based on link.

        file_name : csv file with the columns source, location and job_link
                    (the file produced by get_links_internshala by default).

        Every job_link is downloaded and one row is written to the csv file named by
        file_name_2 with the columns:
        source, location, job_link, job_title, company_name, imp_fields,
        description_headings, description.

        A page without a job title block is skipped. A missing company name or
        description is written as an empty cell instead of aborting the whole run.
    '''
    def stripped_text(tag):
        # text of a tag without surrounding whitespace, None when the tag was not found
        return tag.get_text().strip() if tag is not None else None

    # reading first csv file which contains the links
    df = pd.read_csv(file_name)

    # writing row heading to understand each column
    row_heading = ['source', 'location', 'job_link', 'job_title', \
                   'company_name', 'imp_fields', 'description_headings', 'description']

    # the with block closes the file even when a request fails half way;
    # newline='' is what the csv module expects and utf-8 matches pandas' default
    with open(file_name_2, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # initially writing header of csv file
        writer.writerow(row_heading)

        # walking the three input columns in step keeps source/location with their link
        for source, location, link in zip(df.source, df.location, df.job_link):
            # passing  url to scrape on each job link
            soup = scrape_main(link)

            # getting job title by finding unique class name in webpage
            job_title = stripped_text(soup.find('div', {'class': 'heading_4_5 profile'}))
            if job_title is None:
                continue

            # getting company name by unique class name of div tag
            company_name = stripped_text(soup.find('div', {'class': 'heading_6 company_name'}))

            # important fields of the posting as a list, expected to be:
            # 1. start date of joining/mode of vacancy
            # 2. duration of job
            # 3. incentives/stipend
            # 4. last date to apply
            # 5. types of doing job
            imp_fields = [i.get_text().strip()
                          for i in soup.find_all('div', {'class': 'item_body'})]

            # this list is extracting for further process to do in description to get valuable information.
            description_headings = [i.get_text().strip()
                                    for i in soup.find_all('div', {'class': 'section_heading heading_5_5'})]

            # complete description of job
            description = stripped_text(soup.find('div', {'class': 'internship_details'}))

            # the two lists end up in the csv as their python text representation
            writer.writerow([source, location, link, \
                             job_title, company_name, imp_fields, description_headings, description])

    # pandas re-writes the file so its final layout is the same as before this
    # change; to_csv overwrites the file, so no separate os.remove is needed
    df2 = pd.read_csv(file_name_2)
    df2.to_csv(file_name_2, index = False)


#=================================================================================================================

def to_database_format_internshala():
    '''
        Re-export both internshala csv files with pipe '|' as delimiter.

        Reads the files named by file_name_1 and file_name_2 and writes each one to a
        new file called 'internshala_first_database_<timestamp>.csv' respectively
        'internshala_second_database_<timestamp>.csv', without the index column.
    '''
    notice = (
        "\tHere the csv file stored in pipe as delimiter format "
        "\n\t file names are given in time format way as: "
        "\n\t'internshala_first/second_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'"
    )
    print(notice)

    def export(source_csv, prefix):
        # load first, then stamp the target name, then write with '|' between columns
        frame = pd.read_csv(source_csv)
        target = prefix + time.strftime("%d_%m_%Y_%H_%M_%S") + '.csv'
        frame.to_csv(target, sep='|', index=False)

    # links file first, details file second
    export(file_name_1, 'internshala_first_database_')
    export(file_name_2, 'internshala_second_database_')



#=============================================================================================================

In [2]:
get_links_internshala()

In [3]:
get_complete_info_internshala()

In [4]:
to_database_format_internshala()

	Here the csv file stored in pipe as delimiter format 
	 file names are given in time format way as: 
	'internshala_first/second_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'


In [5]:
#importing all necessary libraries

import pandas as pd
import csv
from bs4 import BeautifulSoup
import requests
import time
import os


#========================================================================================================

# Indeed search pages to crawl, one per state, newest postings first.
# Every url ends with "start=" (presumably the result offset is meant to be appended).
links_list = [
    'https://www.indeed.co.in/jobs?q=&l=Telangana&radius=100&sort=date&start=',
    'https://www.indeed.co.in/jobs?q=&l=Karnataka&radius=100&sort=date&start=',
    'https://www.indeed.co.in/jobs?q=&l=Orissa&radius=100&sort=date&start=',
]


#=================================================================================================================

def url_soup(url, timeout=30):
    '''
      Download a web page and return it parsed as a BeautifulSoup tree.

      url     : address of the page to request.
      timeout : seconds to wait for the server before giving up (default 30).
                requests has no timeout of its own, so without this a stalled
                connection would block the whole scrape forever.

      Returns a BeautifulSoup object built from the response text with the lxml parser.
      The HTTP status code is not checked, so an error page is parsed like any other page.

      Raises whatever requests.get raises (for example a timeout or connection error).

      lxml is a separate package; if the parser is missing install it by: " pip install lxml "
    '''
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')


#=================================================================================================================

# Name of the indeed csv: fixed prefix, then the time this line ran formatted as
# day_month_year_hour_minute_second, then the .csv extension.
file_name = "".join(["indeed_", time.strftime("%d_%m_%Y_%H_%M_%S"), ".csv"])


#=================================================================================================================

def _indeed_text(tag):
    '''Return the text of a tag without surrounding whitespace, or None when the tag is missing or empty.'''
    if tag is None:
        return None
    return tag.get_text().strip() or None


def _indeed_job_rows(base_url, first_page):
    '''
        Lazily yield one csv row per job card found under one indeed search url.

        first_page is the already parsed first result page; the following pages are
        requested as base_url + offset (10, 20, ... 90) only when the caller keeps
        iterating. Paging stops early at the first page without any job card.
    '''
    for start in range(0, 100, 10):
        page_soup = first_page if start == 0 else url_soup(base_url + str(start))

        cards = page_soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        if not cards:
            return

        for card in cards:
            # the short summary is the first list item inside the summary block
            summary = card.find('div', {'class': 'summary'})
            short_summary = _indeed_text(summary.li if summary is not None else None)

            # the first anchor of the card carries the relative link of the posting
            anchor = card.a
            href = anchor.get('href') if anchor is not None else None
            job_link = 'https://www.indeed.co.in' + href if href else None

            yield ["indeed",
                   _indeed_text(card.find('a', {'class': 'jobtitle'})),
                   _indeed_text(card.find('span', {'class': 'company'})),
                   _indeed_text(card.find('span', {'class': 'salaryText'})),
                   _indeed_text(card.find('div', {'class': 'location'})),
                   short_summary,
                   job_link]


def scrape_indeed(list = links_list, max_jobs_per_link = 1):
    '''
        A function in indeed to extract jobs in indeed using the array of links_list,

        where all job links are given based on location wise.

        list              : search urls ending in "start=" (the name shadows the builtin
                            but is kept because callers may pass it by keyword).
        max_jobs_per_link : how many postings to keep per search url. The default 1
                            matches the earlier behaviour, where the loops stopped after
                            the first posting; pass None to read up to ten result pages.

        Rows are written to the csv file named by file_name with the columns:
        Source, job_title, company_name, salary, location, short_summary, link.
        A field that is not present on a job card is written as an empty cell.
    '''
    from itertools import islice

    # writing row heading to understand each column
    row_heading = ['Source', 'job_title', 'company_name', 'salary', 'location', 'short_summary', 'link']

    # the with block closes the file even when a request fails half way;
    # newline='' is what the csv module expects and utf-8 matches pandas' default
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # initially writing header of csv file
        writer.writerow(row_heading)

        for i in list:
            sp = url_soup(i)
            # islice stops pulling from the generator once the limit is reached,
            # so no further result pages are requested than needed
            writer.writerows(islice(_indeed_job_rows(i, sp), max_jobs_per_link))

    # pandas re-writes the file so its final layout is the same as before this
    # change; to_csv overwrites the file, so no separate os.remove is needed
    df = pd.read_csv(file_name)
    df.to_csv(file_name, index=False)


#=================================================================================================================

def to_database_format_indeed():
    '''
        Re-export the indeed csv file with pipe '|' as delimiter.

        Reads the file named by file_name and writes it, without the index column,
        to a new file called 'indeed_database_<timestamp>.csv'.
    '''
    notice = (
        "\tHere the csv file stored in pipe as delimiter format "
        "\n\t file name is given in time format way as: "
        "\n\t'indeed_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'"
    )
    print(notice)

    # load first; the target name is stamped only after the read has finished
    frame = pd.read_csv(file_name)
    frame.to_csv('indeed_database_' + time.strftime("%d_%m_%Y_%H_%M_%S") + '.csv',
                 sep='|', index=False)


# =================================================================================================================



In [6]:
scrape_indeed()

In [7]:
to_database_format_indeed()

	Here the csv file stored in pipe as delimiter format 
	 file name is given in time format way as: 
	'indeed_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'


In [8]:
#importing all necessary libraries

from facebook_scraper_lib import get_posts
import csv
import time
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import os


#=================================================================================================================

def url_rq(link, timeout=30):
    '''
      Download a web page and return its first <div dir="ltr"> element.

      link    : address of the page to request. An empty or missing link returns None
                without any request, because requests.get cannot handle such a value.
      timeout : seconds to wait for the server before giving up (default 30).
                requests has no timeout of its own, so without this a stalled
                connection would block the whole scrape forever.

      Returns the matching tag from the page parsed with the lxml parser, or None
      when the page has no such div.

      Raises whatever requests.get raises (for example a timeout or connection error).

      lxml is a separate package; if the parser is missing install it by: " pip install lxml "
    '''
    if not link:
        return None

    response = requests.get(link, timeout=timeout)
    sp = soup(response.text, 'lxml')
    return sp.find('div', {'dir': 'ltr'})


#=================================================================================================================


# Name of the facebook csv: fixed prefix, then the time this line ran formatted as
# day_month_year_hour_minute_second, then the .csv extension.
file_name = "".join(["facebook_", time.strftime("%d_%m_%Y_%H_%M_%S"), ".csv"])

#=================================================================================================================

def scrape_facebook():
    '''
        A function to get the posts in facebook and based on links in facebook it going to rescrape on that link.

        here it works only on links of id : 380555718642309.

        since the id values of other websites of other id might be different so need to cross check.

        One page of posts is read through get_posts. For every post the linked page is
        fetched with url_rq and the result is stored in the jobs_info column; a post
        without a link gets an empty jobs_info instead of stopping the run.

        Rows are written to the csv file named by file_name with the columns:
        post_id, text, post_text, shared_text, time, likes, comments, shares, link, jobs_info.
    '''

    # writing row heading to understand each column
    row_heading = ['post_id', 'text', 'post_text', 'shared_text', 'time', \
                   'likes', 'comments', 'shares', 'link', 'jobs_info']

    # post text spans several lines, so the file must be opened with newline='' as
    # the csv module expects; utf-8 matches what pandas assumes when reading it back
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # initially writing header of csv file
        writer.writerow(row_heading)

        # extracting posts from facebook  by using facebook_scraper_lib
        for post in get_posts('380555718642309', pages=1):

            # for jobs_info the actual scraping is done to extract xml data of multiple jobs.
            link = post['link']
            jobs_info = url_rq(link) if link else None

            # writing all details to csv
            writer.writerow([post['post_id'], post['text'], post['post_text'],\
                             post['shared_text'], post['time'], post['likes'], post['comments'],\
                             post['shares'], link, jobs_info])

    # pandas re-writes the file so its final layout is the same as before this
    # change; to_csv overwrites the file, so no separate os.remove is needed
    df = pd.read_csv(file_name)
    df.to_csv(file_name, index=False)



#=================================================================================================================

def to_database_format_facebook():
    '''
        Re-export the facebook csv file with pipe '|' as delimiter.

        Reads the file named by file_name and writes it, without the index column,
        to a new file called 'facebook_database_<timestamp>.csv'.
    '''
    notice = (
        "\tHere the csv file stored in pipe as delimiter format "
        "\n\t file name is given in time format way as: "
        "\n\t'facebook_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'"
    )
    print(notice)

    # load first; the target name is stamped only after the read has finished
    frame = pd.read_csv(file_name)
    frame.to_csv('facebook_database_' + time.strftime("%d_%m_%Y_%H_%M_%S") + '.csv',
                 sep='|', index=False)


# =================================================================================================================




In [9]:
scrape_facebook()

In [10]:
to_database_format_facebook()

	Here the csv file stored in pipe as delimiter format 
	 file name is given in time format way as: 
	'facebook_database_ + time.strftime(%d_%m_%Y_%H_%M_%S)'
