In [1]:
import time
from datetime import datetime
import requests
from lxml import html
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

## Load current ideas

In [2]:
# Requires the project's existing Excel database to be present; without it the
# rest of the notebook will not run (project owners only).
# contact J.Rosen.1392@gmail.com
known_urls = pd.read_excel('../assets/FULL DB VALUES.xlsx')['Idea URL'].tolist()
# Keep only what follows the first '.com' in each stored URL, so entries can be
# compared with the site-relative hrefs collected later in the notebook.
current_ideas = [full_url.split('.com', 1)[1] for full_url in known_urls]

## Authentication

In [3]:
import credentials
user = credentials.user
passw = credentials.passw

login_url = "https://valueinvestorsclub.com/login"

# One session for the whole notebook so cookies set at login persist through
# every later request.
session_requests = requests.session()

# Fetch the login form and read the CSRF token from its hidden `_token` input.
result = session_requests.get(login_url)
result.raise_for_status()  # stop here rather than parse an error page
tree = html.fromstring(result.text)
tokens = tree.xpath("//input[@name='_token']/@value")
if not tokens:
    # Without this check a missing token would surface as a bare IndexError.
    raise RuntimeError("No '_token' input found on " + login_url + "; cannot log in")
# Take the first match in document order (deterministic even if several
# `_token` inputs are present).
authenticity_token = tokens[0]

# Form fields posted to the login endpoint.
payload = {
    "login[login_name]": user,
    "login[password]": passw,
    "_token": authenticity_token,
    "commit": "Login"
    }

# Perform the login. The referer header is set to the login page itself.
result = session_requests.post(login_url, data = payload, headers = dict(referer = login_url))
# Surface HTTP-level failures (4xx/5xx) immediately instead of letting every
# later scrape fail silently.
# NOTE(review): a 2xx response does not by itself prove the credentials were
# accepted - confirm how the site reports a rejected login.
result.raise_for_status()

## Scrape functions

In [4]:
def make_soup(url):
    '''Fetch `url` through the shared session and return it parsed by lxml.

    The request sends `url` itself as the referer header.
    '''
    response = session_requests.get(url, headers={'referer': url})
    return BeautifulSoup(response.text, 'lxml')

def get_ideas(url):
    '''Return the href of the first link inside every <span class="vich1">
    found on the page at `url`, in document order.'''
    page = make_soup(url)
    hrefs = []
    for span in page.find_all('span', 'vich1'):
        hrefs.append(span.a['href'])
    return hrefs

def _read_rating(soup, element_id):
    '''Return the `data-rateit-value` of the span with id `element_id` as a
    float, or NaN when it cannot be read.'''
    try:
        return float(soup.find('span', {'id': element_id})['data-rateit-value'])
    except (TypeError, KeyError, ValueError):
        # TypeError: no such span (find returned None); KeyError: attribute
        # absent; ValueError: attribute is not numeric.
        return np.nan


def get_small_info(url):
    '''Scrape one idea write-up page into a single-row DataFrame.

    Parameters
    ----------
    url : str
        Absolute URL of the write-up; stored unchanged in the 'Idea URL' column.

    Returns
    -------
    pandas.DataFrame
        One row with columns Idea, Idea URL, Ticker, Author, Author link,
        Submission Date, Quality, Performance, Description and Long.
        Ticker, Quality and Performance fall back to NaN when their element
        is missing or unreadable.

    Raises
    ------
    AttributeError, TypeError, KeyError
        When a required element (company name, author link, description or
        date block) is missing from the page.
    ValueError
        When the date text does not match the expected month-day-year form.
    '''
    soup = make_soup(url)

    idea_name = soup.find('span','idea_company_name').text

    # Optional field: the ticker is the span with this exact inline style.
    ticker_tag = soup.find('span',{'style':'color:#ccc;'})
    ticker = ticker_tag.text if ticker_tag is not None else np.nan

    # Look the author anchor up once; both attributes come from the same tag.
    author_tag = soup.find('a', {'class':'display_name'})
    member_link = author_tag['href']
    member_name = author_tag['title']

    quality = _read_rating(soup, 'ratings_q')
    performance = _read_rating(soup, 'ratings_p')

    description_raw = soup.find('div',{'id':'description'}).text

    # Take the text before ' by', strip commas and spaces, and parse the
    # remainder as month-name + day + year.
    date_text = soup.find('div', {'style':'display:inline-block;'}).text
    submission_date = datetime.strptime(date_text.split(' by')[0].replace(',','').replace(' ',''),'%B%d%Y')

    # An idea is short only when the short label is present and reads "S".
    # Every other case (no label, or a label with different text) is long;
    # previously the latter left `is_long` unassigned and the return raised.
    short_label = soup.find('span', {'class':'label label-short'})
    is_long = not (short_label is not None and short_label.string == "S")

    return pd.DataFrame({'Idea': [idea_name],
            'Idea URL': [url],
            'Ticker': [ticker],
            'Author': [member_name],
            'Author link': [member_link],
            'Submission Date': [submission_date],
            'Quality': [quality],
            'Performance': [performance],
            'Description': [description_raw],
            'Long': [is_long]})

## Main

In [5]:
base_url = 'https://valueinvestorsclub.com'
# Suffixes of the site's A-to-Z index pages, one per alphabetical bucket.
add_ons = ['0-9','A-C','D-F','G-J','K-N','O-R','S-V','W-Z']

# Gather the idea links from every index page into one flat list,
# pausing one second before each request.
idea_pages = []
for letter_range in add_ons:
    time.sleep(1)
    idea_pages += get_ideas(base_url + '/ideas/atoz/' + letter_range)

In [6]:
#grab each new idea's features and compile into one dataframe

# Ideas to leave alone: everything already in the database, plus one page the
# notebook has always excluded explicitly (the reason is not recorded here).
skip_ideas = set(current_ideas)
skip_ideas.add('/idea/Celebrate_Express/2693')

data_frames = []
for idea in idea_pages:
    if idea in skip_ideas:
        continue
    print(idea)
    time.sleep(1.3) #this may take a while, but always be nice
    try:
        data_frames.append(get_small_info(base_url+idea))
    except Exception as exc:
        # Best effort: one unparseable page must not abort the whole run, but
        # say which page failed and why. Catching Exception (not a bare
        # except) keeps Ctrl-C / kernel interrupt working.
        print('skipped {}: {!r}'.format(idea, exc))

# Concatenate once, after the loop, instead of rebuilding the frame on every
# iteration. `df` is always defined so the export cell never hits a NameError
# or picks up a stale frame from an earlier run.
if data_frames:
    df = pd.concat(data_frames, ignore_index = True)
else:
    df = pd.DataFrame()
    print('no new ideas found')

/idea/FERROGLOBE_PLC/139399
/idea/MELLANOX_TECHNOLOGIES_LTD/139398
/idea/Sirius_Real_Estate/139396


In [7]:
#write to excel file for future steps; the file name carries today's date as YYYYMMDD
date_stamp = datetime.now().strftime('%Y%m%d')
df.to_excel('../assets/Site Scrape ' + date_stamp + '.xlsx', sheet_name='Sheet1', engine='xlsxwriter')