### Interacting with Parse Hub 

In [9]:
import pandas as pd
import numpy as np  
from googlesearch import search
from random import randint
from time import sleep

### Load Data

In [510]:
df = pd.read_csv('Ontario_Pre.csv')

In [511]:
df.head()

Unnamed: 0,selection1_OEM,selection1_Dealer_Name_name,selection1_Dealer_Name_url,selection1_Dealer_Name_Address,selection1_Dealer_Name_City,selection1_Dealer_Name_City_url,selection1_Dealer_Name_State,selection1_Dealer_Name_State_url,selection1_Dealer_Name_ZipCode,selection1_Dealer_Name_ZipCode_url,selection1_Dealer_Name_Sales_Phone_Number,selection1_Dealer_Name_Number_DealerRaterReviews,selection1_Dealer_Name_DealerRaterRating,selection1_Dealer_Name_page
0,Chevrolet,Huron Motor Products,https://www.dealerrater.ca/dealer/Huron-Motor-...,,,,,,,,,,,
1,Chevrolet,Finch Chevrolet Cadillac Buick GMC,https://www.dealerrater.ca/dealer/Finch-Chevro...,,,,,,,,,,,
2,Chevrolet,Georgetown Chevrolet Buick GMC,https://www.dealerrater.ca/dealer/Georgetown-C...,,,,,,,,,,,
3,Chevrolet,Budds' Chevrolet Cadillac Buick GMC,https://www.dealerrater.ca/dealer/Budds-Chevro...,,,,,,,,,,,
4,Chevrolet,Wilson Niblett Motors,https://www.dealerrater.ca/dealer/Wilson-Nible...,,,,,,,,,,,


### Delete Duplicate Dealers 
Dealers created from scraping within ParseHub automatically don't have any details beside Dealer email and DealerRater URL. So locating Dealers without a zipcode and then deleting them, we delete duplicate Dealers

In [512]:
df.loc[df['selection1_Dealer_Name_ZipCode'].isnull(), 'selection1_Dealer_Name_ZipCode']='None'

In [513]:
df = df[df.selection1_Dealer_Name_ZipCode != 'None']

### Reset Index

So that we can later name the exported CSV as the State.csv

In [514]:
df.reset_index(drop=True, inplace=True)

### Rename and Reorganize Columns to main formatting

In [515]:
df = df.rename(columns={'selection1_OEM': 'Manufactuer', 'selection1_Dealer_Name_name': 'Dealership',
                        'selection1_Dealer_Name_url': 'DealerRater URL','selection1_Dealer_Name_Address': 'Address',
                       'selection1_Dealer_Name_City': 'City','selection1_Dealer_Name_State': 'State',
                       'selection1_Dealer_Name_ZipCode': 'Zip Code','selection1_Dealer_Name_Sales_Phone_Number': 'Phone Number',
                       'selection1_Dealer_Name_Number_DealerRaterReviews': 'DealerRater # of Reviews',
                        'selection1_Dealer_Name_DealerRaterRating': 'DealerRater Star Rating',
                       'selection1_Dealer_Name_page': 'Website'})

In [516]:
df['Customer'] = df.apply(lambda _: 'Customer', axis=1)
df['Google Rating'] = df.apply(lambda _: 'Google Rating', axis=1)
df['# of Google Reviews'] = df.apply(lambda _: '# of Google Reviews', axis=1)
df['Dealership Group'] = df.apply(lambda _: 'Dealership Group', axis=1)

In [517]:
df.columns

Index(['Manufactuer', 'Dealership', 'DealerRater URL', 'Address', 'City',
       'selection1_Dealer_Name_City_url', 'State',
       'selection1_Dealer_Name_State_url', 'Zip Code',
       'selection1_Dealer_Name_ZipCode_url', 'Phone Number',
       'DealerRater # of Reviews', 'DealerRater Star Rating', 'Website',
       'Customer', 'Google Rating', '# of Google Reviews', 'Dealership Group'],
      dtype='object')

In [518]:
df = df[['Manufactuer','Dealership','Address', 'City','State','Zip Code','Phone Number','Website',
      'Dealership Group','Customer', 'Google Rating','# of Google Reviews','DealerRater # of Reviews',
      'DealerRater Star Rating','DealerRater URL']]

In [519]:
cols = list(df.columns.values)

### Rename and Group all OEM's 

In [520]:
OEM_Groups = {
    'Ford': 'Ford Lincoln',
    'Lincoln': 'Ford Lincoln',
    'Dodge': 'FCA',
    'Chrysler': 'FCA',
    'Jeep': 'FCA',
    'Ram': 'FCA',
    'FIAT':'FIAT',
    'Alfa Romeo':'Alfa Romeo',
    'Buick': 'GM',
    'GMC':'GM',
    'Cadillac': 'GM',
    'Chevrolet': 'GM',
    'Toyota': 'Toyota',
    'Nissan': 'Nissan',
    'BMW': 'BMW',
    'Hyundai': 'Hyundai',
    'Kia': 'Kia',
    'Lexus': 'Lexus',
    'MINI': 'MINI',
    'Mazda': 'Mazda',
    'Mitsubishi': 'Mitsubishi',
    'Volvo': 'Volvo',
    'Honda': 'Honda',
    'Acura': 'Acura',
    'Audi': 'Audi',
    'Mercedes-Benz': 'Mercedes-Benz',
    'Subaru': 'Subaru',
    'Volkswagen': 'Volkswagen',
    'INFINITI': 'INFINITI',
    'Jaguar': 'Jaguar',
    'Porsche': 'Porsche',
    'Land Rover': 'Land Rover',
    'Maserati': 'Maserati',
    'Lamborghini': 'Lamborghini',
    'McLaren': 'McLaren',
    'Bentley': 'Bentley',
    'Aston Martin': 'Aston Martin',
    'Rolls Royce': 'Rolls Royce',
    'Recreational Vehicles': 'Recreational Vehicles',
    'Used Car Dealer': 'Used Car Dealer',
    'Ferrari': 'Ferrari',
    'smart': 'smart'
}

In [521]:
df['Manufactuer'] = df['Manufactuer'].map(OEM_Groups)

### Delete all FCA and OEM's already contained in list

Because we wanted to weed out any duplicate dealerships within the "Used Car" category, we chose to scrape all the dealerships that we already had on our list. However we already have their data, so we delete those rows

In [522]:
#df = df[df.Manufactuer != 'Ford Lincoln']
#df = df[df.Manufactuer != 'Toyota']
#df = df[df.Manufactuer != 'Nissan']
#df = df[df.Manufactuer != 'BMW']
#df = df[df.Manufactuer != 'FCA']
#df = df[df.Manufactuer != 'Hyundai']
#df = df[df.Manufactuer != 'Kia']
#df = df[df.Manufactuer != 'Lexus']
#df = df[df.Manufactuer != 'MINI']
#df = df[df.Manufactuer != 'Mazda']
#df = df[df.Manufactuer != 'Mitsubishi']

### Fill in missing Websites

In [523]:
df.loc[df['Website'].isnull(), 'Website']='None'

In [524]:
def Get_URL (platform):
    for url in search ("'" + platform + "'", stop=1):
        return(url)

In [525]:
#sleep(randint(0,3))

In [526]:
df['Website'] = np.where(df['Website'] == 'None', df['Dealership'].apply(Get_URL), df['Website'])

In [527]:
df.head()

Unnamed: 0,Manufactuer,Dealership,Address,City,State,Zip Code,Phone Number,Website,Dealership Group,Customer,Google Rating,# of Google Reviews,DealerRater # of Reviews,DealerRater Star Rating,DealerRater URL
0,GM,Roy Nichols Chevrolet,2728 Courtice Road,Courtice,Ontario,L1E 2M7,(905) 436-2222,https://www.roynicholsmotors.com/,Dealership Group,Customer,Google Rating,# of Google Reviews,1824 Lifetime Reviews,5.0,https://www.dealerrater.ca/dealer/Roy-Nichols-...
1,GM,Jeff Smith's County Chevrolet,224 Talbot St N,Essex,Ontario,N8M 2C8,(519) 776-4222,https://www.countychevroletessex.com/,Dealership Group,Customer,Google Rating,# of Google Reviews,6 Lifetime Reviews,3.4,https://www.dealerrater.ca/dealer/Jeff-Smith-s...
2,GM,The Humberview Group,3200 Bloor Street West,Toronto,Ontario,M8X 1E1,(877) 893-0990,https://www.humberviewgroup.com/,Dealership Group,Customer,Google Rating,# of Google Reviews,1824 Lifetime Reviews,1.8,https://www.dealerrater.ca/dealer/The-Humbervi...
3,GM,Upper Canada Motor Sales Limited,12375 Country Rd 2,Morrisburg,Ontario,K0C 1X0,(613) 543-2925,https://www.uppercanadamotors.com/,Dealership Group,Customer,Google Rating,# of Google Reviews,1 Lifetime Review,1.0,https://www.dealerrater.ca/dealer/Upper-Canada...
4,GM,Bill Spencer Chevrolet Ltd,Hwy #2 West,Cobourg,Ontario,K9A 4S3,(905) 372-8773,https://www.billspencerchevrolet.com/,Dealership Group,Customer,Google Rating,# of Google Reviews,1824 Lifetime Reviews,,https://www.dealerrater.ca/dealer/Bill-Spencer...


### Clear Values within Customer, Dealership Group, Google Rating and # of Google Reviews Columns

In [528]:
df[cols] = df[cols].replace({'Customer':np.nan, 'Google Rating':np.nan,
                               '# of Google Reviews':np.nan,'Dealership Group':np.nan})

### Name the CSV before we change to Abbreviations

In [529]:
State =df['State']

In [530]:
CSV_name = State[0]

### Replace States with Abbreviations

In [306]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'Dist. of Columbia':'DC',
    'Newfoundland and Labrador':'NL',
    'Prince Edward Island':'PE',
    'Nova Scotia':'NS',
    'New Brunswick':'NB',
    'Quebec':'QC',
    'Ontario':'ON',
    'Manitoba':'MB',
    'Saskatchewan':'SK',
    'Alberta':'AB',
    'British Columbia':'BC',
    'Yukon':'YT',
    'Northwest Territories' :'NT',
    'Nunavut':'NU'
}

In [532]:
df['State'] = df['State'].map(us_state_abbrev)

In [533]:
df.head()

Unnamed: 0,Manufactuer,Dealership,Address,City,State,Zip Code,Phone Number,Website,Dealership Group,Customer,Google Rating,# of Google Reviews,DealerRater # of Reviews,DealerRater Star Rating,DealerRater URL
0,GM,Roy Nichols Chevrolet,2728 Courtice Road,Courtice,ON,L1E 2M7,(905) 436-2222,https://www.roynicholsmotors.com/,,,,,1824 Lifetime Reviews,5.0,https://www.dealerrater.ca/dealer/Roy-Nichols-...
1,GM,Jeff Smith's County Chevrolet,224 Talbot St N,Essex,ON,N8M 2C8,(519) 776-4222,https://www.countychevroletessex.com/,,,,,6 Lifetime Reviews,3.4,https://www.dealerrater.ca/dealer/Jeff-Smith-s...
2,GM,The Humberview Group,3200 Bloor Street West,Toronto,ON,M8X 1E1,(877) 893-0990,https://www.humberviewgroup.com/,,,,,1824 Lifetime Reviews,1.8,https://www.dealerrater.ca/dealer/The-Humbervi...
3,GM,Upper Canada Motor Sales Limited,12375 Country Rd 2,Morrisburg,ON,K0C 1X0,(613) 543-2925,https://www.uppercanadamotors.com/,,,,,1 Lifetime Review,1.0,https://www.dealerrater.ca/dealer/Upper-Canada...
4,GM,Bill Spencer Chevrolet Ltd,Hwy #2 West,Cobourg,ON,K9A 4S3,(905) 372-8773,https://www.billspencerchevrolet.com/,,,,,1824 Lifetime Reviews,,https://www.dealerrater.ca/dealer/Bill-Spencer...


### Export CSV

In [534]:
df.to_csv("" + CSV_name + ".csv",index=False, header =False)

# Getting Data Review Ready

In [163]:
from random import randint
from time import sleep
import requests
import json
from bs4 import BeautifulSoup
import re

#sleep(randint(10,100))

In [167]:
df = pd.read_csv('Master North American OEM Dealerships - Alabama AL.csv')

In [168]:
State =df['State']
CSV_name = State[0]

In [169]:
df["Keywords"] = df["Dealership"].map(str) + " " + df["City "] 

In [170]:
lst = df['Dealership']
lst = [w.replace('&', 'and') for w in lst]
df['Dealership'] =lst
# & Symbols mess with the way that Google presents the dealership, so we start by replacing those symbols in the data

In [171]:
def Gog_Rat (dealer):
    try:
        site_text = requests.get("https://google.com/search?q='" + dealer + "'").text
        soup = BeautifulSoup(site_text, 'html.parser')
        sleep(randint(1,3))
        return soup.find('div', class_='BNeawe tAd8D AP7Wnd').find('span').text
    except AttributeError:
        print ("Nope")
#This is the main code to retrieve the review data. It is extremly difficult to find a better class and span 
#that are consistently reliable. It has a random 1-3 second delay built in as a precaution. This could be reduced to 
#improve the speed

In [None]:
df['reviews'] = df['Keywords'].apply(Gog_Rat)

Nope
Nope
Nope


In [None]:
def Get_Only_nums (dealer):
    a = re.sub("[^0-9.()]", "", str(dealer))
    return a
#With the addition of .find('span') in our Web Scrape this may not be neccesary, but it is a precaution

In [None]:
df['Google_Review'] = df['reviews'].apply(Get_Only_nums)

In [None]:
df['Google Rating'] = df.Google_Review.str[0:3]

In [None]:
def Get_review (dealer):
    m= re.search('\(([^)]+)', str(dealer))
    if m is None:
        return None  
    return m.group(1)

In [None]:
df['# of Google Reviews'] = df['Google_Review'].apply(Get_review)

In [None]:
df.loc[df['# of Google Reviews'].isnull(), '# of Google Reviews']='None'

# Pass Through 2
### Search with Dealership and Address

In [None]:
booleanDictionary = {'nothing': 'stuff', 'Trash': 'FALSE'}

In [None]:
df['Google Rating'] = np.where(df['# of Google Reviews'] == 'None', df['Google Rating'].map(booleanDictionary), df['Google Rating'])
# This makes sure that Google Rating and # of Google Reviews both are unified if one of the two have 'None'


In [None]:
df.loc[df['Google Rating'].isnull(), 'Google Rating']='None'


In [None]:
df["Keywords2"] = df["Dealership"].map(str) + " " + df["Address "]

In [None]:
df['reviews2'] = np.where(df['Google Rating'] == 'None', df['Keywords2'].apply(Gog_Rat), df['reviews'])

In [None]:
df['Google Rating'] = np.where(df['Google Rating'] == 'None', df['reviews2'].str[0:4], df['Google Rating'])

In [None]:
df['# of Google Reviews'] = np.where(df['# of Google Reviews']=='None', df['reviews2'].apply(Get_review), df['# of Google Reviews'])

In [None]:
df.loc[df['Google Rating'].isnull(), 'Google Rating']='None'

In [None]:
df["Keywords3"] = df["Dealership"].map(str) + " " + df["Address "]+ " "+ df["City "]

In [None]:
print (df['Google Rating'].value_counts())

# Pass Through 3
### Search with Dealership, Address, and City

In [None]:
df.loc[df['# of Google Reviews'].isnull(), '# of Google Reviews']='None'

In [None]:
df['reviews3'] = np.where(df['Google Rating'] == 'None', df['Keywords3'].apply(Gog_Rat), df['reviews2'])

In [None]:
df['Google Rating'] = np.where(df['Google Rating'] == 'None', df['reviews3'].str[0:4], df['Google Rating'])

In [None]:
df['# of Google Reviews'] = np.where(df['# of Google Reviews']=='None', df['reviews3'].apply(Get_review), df['# of Google Reviews'])

In [None]:
print (df['Google Rating'].value_counts())

# Pass Through 4 
### Only search with Dealership Name

In [None]:
df.loc[df['Google Rating'].isnull(), 'Google Rating']='None'
df.loc[df['# of Google Reviews'].isnull(), '# of Google Reviews']='None'

In [None]:
df['reviews4'] = np.where(df['Google Rating'] == 'None', df['Dealership'].apply(Gog_Rat), df['reviews3'])

In [None]:
df.loc[df['# of Google Reviews'].isnull(), '# of Google Reviews']='None'

In [None]:
df.loc[df['Google Rating'].isnull(), 'Google Rating']='None'

In [None]:
df['Google Rating'] = np.where(df['Google Rating'] == 'None', df['reviews4'].str[0:4], df['Google Rating'])

In [None]:
df['# of Google Reviews'] = np.where(df['# of Google Reviews']=='None',
                                     df['reviews4'].apply(Get_review), df['# of Google Reviews'])

In [None]:
print (df['Google Rating'].value_counts())

# Clean and Export

In [None]:
df.columns

In [None]:
del df['reviews4']
del df['Keywords']
del df['Google_Review']
del df['Keywords3']
del df['reviews']
del df['Keywords2']
del df['reviews2']
del df['reviews3']

In [None]:
df.loc[df['# of Google Reviews'].isnull(), '# of Google Reviews']='None Found'
df.loc[df['Google Rating'].isnull(), 'Google Rating']='None Found'

In [None]:
df.to_csv("" + CSV_name + " REVIEWs.csv",index=False)

### Possibly using Selenium to find if a text/code exists on a webpage

https://stackoverflow.com/questions/11454798/how-can-i-check-if-some-text-exist-or-not-in-the-page-using-selenium/11464797

In [113]:
driver.getPageSource().contains("Text which you looking for");


NameError: name 'driver' is not defined

In [112]:
from selenium import webdriver
 
# to open Firefox web browser and maximize the window
browser = webdriver.Firefox(executable_path='<Path>/geckodriver.exe')
browser.maximize_window()
 
#connect to the specific URL
browser.get("http://192.168.1.1:8090/httpclient.html")
 
assert browser.page_source.find("Cyberoam Captive Portal")

WebDriverException: Message: 'geckodriver.exe' executable needs to be in PATH. 


In [5]:
import urllib.requests
import re

html_content = urllib.requests.urlopen('http://www.domain.com').read()

matches = re.findall('regex of string to find', html_content);

if len(matches) == 0: 
   print ('I did not find anything')
else:
   print ('My string is in the html')

ModuleNotFoundError: No module named 'urllib.requests'

https://stackoverflow.com/questions/4925966/searching-through-webpage