# Scraping

In [1]:
from sre_constants import NOT_LITERAL
from urllib.parse import uses_params
import pandas as pd
import re
import numpy as np
from transformers import pipeline
import json

# import csv & xlsx

In [2]:
# read car_models.csv
df_models = pd.read_excel('car_models_edited.xlsx')
# df_models = pd.read_csv('car_models.csv')
#read brand_info.csv
df_brand = pd.read_csv('brand_info.csv')

df_models.head()

Unnamed: 0,model,review,make,extracted,make_description exist
0,ACURA MDX,"Reaching the 4th generation, the MDX became th...",ACURA,"{ ""car-model"": ""MDX"", ""year"": ""4th generation...","Innovative, Luxury, Sporty, Technological, Sle..."
1,ACURA RDX,Acura introduced a facelift for the third gene...,ACURA,"{ ""car-model"": ""RDX"", ""year"": ""2022"", ""type...","Innovative, Luxury, Sporty, Technological, Sle..."
2,ACURA TLX,No more drama and no more facelifted version f...,ACURA,"{ ""car-model"": ""Acura TLX"", ""year"": ""2021"", ...","Innovative, Luxury, Sporty, Technological, Sle..."
3,ACURA ILX,The facelifted version of the ILX brought a sp...,ACURA,"{ ""car-model"": ""ILX"", ""year"": ""facelifted ve...","Innovative, Luxury, Sporty, Technological, Sle..."
4,ACURA RLX,"In 2017, Acura introduced the facelifted versi...",ACURA,"{ ""car-model"": ""RLX"", ""year"": ""2017"", ""type...","Innovative, Luxury, Sporty, Technological, Sle..."


Combine model overall designs with make description

In [3]:
# find all unique make in df_models
make = df_models['make'].unique()
# print(make)

overall_design = {}
#loop through all makes
for m in make:
    # print(m)
    design = []
        
    # get all models of the make
    df_make = df_models[df_models['make'] == m]
    # read dictionary from the "extracted" column
    dict_extracted = df_make['extracted'].to_dict()
    # print(dict_extracted)
    #extract the "overall-design" value from all the entries in the dictionary
    for key, value in dict_extracted.items():
        #convert value string to dictionary
        try:
            extracted_dict = json.loads(value)
        except Exception as e:
            print(e)
            print(key,value)

        if 'overall-design' in extracted_dict:
            design+=extracted_dict['overall-design']
    # print(df_models[df_models['make'] == m]['make_description exist'].unique())
    make_str = df_models[df_models['make'] == m]['make_description exist'].unique()
    #convert strings to list, split by comma and remove spaces if any
    make_list = make_str[0].split(',')
    make_list = [x.strip() for x in make_list]
    # print(make_list)

    overall_design[m] = design + make_list
    
# export json
with open('json/overall_design.json', 'w') as f:
    json.dump(overall_design, f)


# BERT (Relevance)

In [4]:
# zero shot classification for positive sentiment
classifier = pipeline("zero-shot-classification")

def get_relevance(text):
    categories = ['Aggressive', 'Dynamic','Elegant','Friendly', 'Modern', 'Powerful' , 'Sporty', 'Stable']
    result = classifier(text, categories)
    return result

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
# test
# get_relevance("bespoke luxurious unique dynamic demanding elegant majestic dominant flush-to-bodywork environmentally-friendly Luxury, Comfortable, Elegant, Classic, Expensive, Heritage, Quality elegant supple harmonious luxurious quiet")

In [6]:
#make a list of strings to one string
def list_to_string(s):  
    # initialize an empty string 
    str1 = " " 
    # return string   
    return (str1.join(s))

with open('json/overall_design.json', 'r') as f:
    overall_design = json.load(f)

# get the relevance score for each make
def get_relevance_score(make):
    # read overall_design.json
    # print(overall_design[make])
    # convert list to string
    make_str = list_to_string(overall_design[make])
    # print(make_str)
    # get relevance score
    relevance = get_relevance(make_str)
    print(make, relevance)
    return relevance

make_design = {}
for key,value in overall_design.items():
    # print(key)
    relevance = get_relevance_score(key)
    make_design[key] = relevance

# export json
with open('json/make_design.json', 'w') as f:
    json.dump(make_design, f)
    

ACURA {'sequence': 'bold aggressive stylish performance-oriented radical dynamic angular aggressive sharp lines sportier refreshed upgraded facelifted improved flagship sports sedan premium sporty electric revolutionary sportscar appearance angular front fascia arched roofline luxurious cabin sporty angular premium youthful family-friendly Innovative Luxury Sporty Technological Sleek Sharp', 'labels': ['Aggressive', 'Powerful', 'Dynamic', 'Sporty', 'Modern', 'Friendly', 'Elegant', 'Stable'], 'scores': [0.47808903455734253, 0.2407149076461792, 0.16600890457630157, 0.10449627041816711, 0.005839053075760603, 0.0027359179221093655, 0.001452585100196302, 0.0006633119191974401]}
ALFA ROMEO {'sequence': 'sports compact premium refreshed customizable improved spectacular slimmer sharper refreshed sporty attractive stylish aggressive muscular sporty distinct Stylish exotic sporty modern attractive sleek sporty modern bold spirited fast sharp fluid sculptured beautiful aggressive-looking lightwe