In [1]:
import pandas as pd
import numpy as np
import scipy
import re
import csv
import operator
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
import ast

In [2]:
df = pd.read_csv("10361.csv",encoding='ISO-8859-1') 

In [3]:
df.head(1)

Unnamed: 0,author,date,vehicle,overall_rate,text,total_category,category_rates
0,Daniel,"Updated on Dec 4, 2016",2016 Acura ILX,4,The car drives fantastically. It's sporty enou...,13,"{'Value': 4.0, 'Exterior': 3.0, 'Ride': 4.5, '..."


In [4]:
len(df.category_rates)

10361

In [5]:
len(df)

10361

# Functions to clean text

In [6]:
def CleanText(raw_comment):
    # 1. lower case
    new_comment = raw_comment.lower()
    # 2. remove punctuation
    new_comment = re.sub(r"[^\w\s]", "", new_comment)
    
    return new_comment


#Remove stop words
stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)


#Stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

In [7]:
df['textclean'] = df['text'].apply(CleanText)
df['textclean'] = df['textclean'].apply(removeStopWords)
df['textclean'] = df['textclean'].apply(stemming)

In [8]:
print(df.head(1)['text'])

0    The car drives fantastically. It's sporty enou...
Name: text, dtype: object


In [9]:
#l = 'powerful engine is a good'
#stemming(l)

# Aspect words lists

In [27]:
wordslist = {}
wordslist['Technology'] = ['tech','technologically', 'phone', 'bluetooth', 'power', 'navigation', 'acc', 
                           'android', 'touch', 'touchscreen',
                           'cruise', 'navi', 'software', 'horsepower', 'camera', 'radio', 'entertainment' , 
                           'remote', 'lane', 'display', 'push button start',
                           'infotainment', 'voice', 'electronic', 'sensor', 'touch button', 'connect', 'push button', 
                           'nav', 'iphone', 'smartphone', 'map', 'connection', 'cellphone','keyless',  
                           'connectivity', 'computer', 'gps', 'hands free', 'feature','apple','carplay', 'ac','air condition']
wordslist['Audio'] = ['audio', 'speaker', 'sound', 'stereo', 'aux', 'media', 'mp3 player', 'player', 'cd','music',
                     'song','loud']
wordslist['Performance'] = ['engine', 'automatic', 'transmission', 'acceleration', 'accelerate', 'performance', 'steering', 
                            'gasket', 'cylinder', 'road handling', 'gear', 'gearbox', 'awd', 'front wheel drive', 'handle', 'wheel drive', 'engineer', 'exhaust',  
                            'clean diesel', 'town driving', 'handling', 'auto open', 'auto shut', 'drivetrain', 'automate', 'tow', 'diesel', 'haul', 'gearshift']

wordslist['Exterior'] = ['window', 'door', 'headlight', 'windshield', 'exterior', 'wheel', 'tire', 'wiper', 'color', 'style', 
                         'liftgate', 'paint', 'tailgate', 'lift gate', 'run flats', 'flat tires', 'styling','style', 'rear gate', 'tail gate', 'headlamp']

wordslist['Ride'] = ['ride']

wordslist['Seating'] = ['seat', 'seatbelt', 'heat', 'lumbar', 'backseat', 'memory']

wordslist['Interior'] = ['trunk', 'storage', 'light', 'cargo', 'space', 'headroom', 'gas pedal', 'steering wheel', 'interior', 'mirror', 'moonroof', 'sunroof', 
                         'sunglass', 'cup holders',  'door pockets', 'dashboard', 'roof', 'door lock', 'sun visor', 'sunvisor', 'roomy', 'room', 'center dash', 
                         'rear view', 'console','glove compartment', 'high beam', 'legroom', 'glovebox',  'glove box', 'steering column', 'headrest', 'black dash', 
                         'lock', 'dash board', 'unlock', 'front panel', 'taillight', 'floor mat', 'cigarette lighter '] 

wordslist['Efficiency']  = ['mileage', 'gas', 'hybrid', 'fuel', 'efficient', 'fuel economy', 'battery charge', 'battery', 'tank', 'fuel consumption']

wordslist['Reliability'] = ['noise', 'reliability', 'reliable', 'rattle', 'rattling', 'shake']

wordslist['Safety'] = ['safe','quality', 'safety', 'change lane', 'brake', 'camera', 'air bag', 'airbag', 'lane assistant', 'lane departure', 'sensor', 'blind spot', 'warning']

wordslist['Comfort'] = ['heat', 'comfortably', 'seat', 'comfortable', 'luxury', 'lumbar support', 'air conditioning', 'uncomfortable', 'climate control', 'cool system', 'air vents', 'comfort', 
                        'cooled seats', 'convenience', 'armrest', 'headrest', 'backseat', 'arm rest', 'memory', 'seatback', 'carseat', 'cooling system', 'seatbelt', 
                        'airflow', 'temperature gauge', ]

wordslist['Value'] = ['value','useful','expensive','cheap','price', 'low', 'money', 'wallet']


wordslist['Satisfaction'] = ['like', 'good', 'handsome' ,'excellent','great','perfect','gross', 'lovely', 'hate','incomplete','happy', 'nice', 'satisfy', 'cheesy','beautiful', 'positive','satisfactory','wonderful',
                      'fun', 'impressive', 'bad', 'love', 'dissatisfied', 'angry', 'extraordinary','disappointed','horrible', 'suck','annoy','pleasure','complain','angry','annoyed', 'afraid', 'awkward', 'affectionate',
                             'anxious', 'alarmed', 'awed', 'aggravated', 'amazed', 'astonished', 'amused', 'apprehensive', 'absorbed', 'ambivalent', 'ashamed', 'able', 'addled', 'admired', 'admirable', 
                             'affable', 'agreeable', 'aggressive', 'abandoned','destroy','worst','shame','best','amazing','enjoy','ugly','terrific','glad','decent','regret','pleased','awful','detriment','ridiculous','negative']

In [28]:
stemlist = {}
for k in wordslist:
    for word in wordslist[k]:
        #print(word)
        stem = stemmer.stem(word)
        stemlist.setdefault(k, []).append(stem)

In [29]:
print(stemlist)

{'Technology': ['tech', 'technolog', 'phone', 'bluetooth', 'power', 'navig', 'acc', 'android', 'touch', 'touchscreen', 'cruis', 'navi', 'softwar', 'horsepow', 'camera', 'radio', 'entertain', 'remot', 'lane', 'display', 'push button start', 'infotain', 'voic', 'electron', 'sensor', 'touch button', 'connect', 'push button', 'nav', 'iphon', 'smartphon', 'map', 'connect', 'cellphon', 'keyless', 'connect', 'comput', 'gps', 'hands fre', 'featur', 'appl', 'carplay', 'ac', 'air condit'], 'Audio': ['audio', 'speaker', 'sound', 'stereo', 'aux', 'media', 'mp3 player', 'player', 'cd', 'music', 'song', 'loud'], 'Performance': ['engin', 'automat', 'transmiss', 'acceler', 'acceler', 'perform', 'steer', 'gasket', 'cylind', 'road handl', 'gear', 'gearbox', 'awd', 'front wheel dr', 'handl', 'wheel driv', 'engin', 'exhaust', 'clean diesel', 'town driv', 'handl', 'auto open', 'auto shut', 'drivetrain', 'autom', 'tow', 'diesel', 'haul', 'gearshift'], 'Exterior': ['window', 'door', 'headlight', 'windshield'

# use stemed text and wordslist

In [30]:
type(df['text'])
len(df)

10361

In [31]:
df['textclean'][10065]

'consol restrict leg room right leg drive find lean get consol side turn signal wiper lever close steer wheel sometim hit make turn move hand along wheelorigin review jun 11 2016turn signal lever wiper lever close steer wheel hit make turn'

In [32]:
def containword(a):
    valuelist = []
    for i in range(0,len(df)):
    
        count = 0
        while count < len(stemlist[a]):
            if stemlist[a][count] in df['textclean'][i]:
                valuelist.append('Y')
                break;
            else:
                count += 1
        if count == len(stemlist[a]):
            valuelist.append('N')
            
    return valuelist

In [33]:
#l = 'Best choice, investment in awhile. Love it. Even wife agrees.'
print(df.loc[10065]['text'])
l = df.loc[10065]['text']
alist = []

m = stemming(l)
print(m)
for a in stemlist:
    count = 0
    while count < len(stemlist[a]):
        if stemlist[a][count] in m:
                alist.append('Y')
                break;
        else:
                count += 1
    if count == len(stemlist[a]):
        alist.append('N')
print(alist)

Console restricts leg room for right leg while driving; find myself leaning my get on console side. Turn Signal and wiper lever are very close to steering wheel; sometimes I hit them while making turns or moving my hands along the wheel.Originally Reviewed on Jun 11, 2016Turn signal lever and Wiper lever are too close to the steering wheel. I hit them when making turns.
consol restrict leg room for right leg while driving; find myself lean my get on consol side. turn signal and wiper lever are veri close to steer wheel; sometim i hit them while make turn or move my hand along the wheel.origin review on jun 11, 2016turn signal lever and wiper lever are too close to the steer wheel. i hit them when make turns.
['N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N', 'N']


In [34]:
for k in stemlist:
    #print('see what will print')
    valuelist = containword(k)
    df[k] = np.asarray(valuelist)

In [35]:
df.head(3)

Unnamed: 0,author,date,vehicle,overall_rate,text,total_category,category_rates,textclean,Technology,Audio,...,Ride,Seating,Interior,Efficiency,Reliability,Safety,Comfort,Value,Satisfaction,fourdescribtionlist
0,Daniel,"Updated on Dec 4, 2016",2016 Acura ILX,4,The car drives fantastically. It's sporty enou...,13,"{'Value': 4.0, 'Exterior': 3.0, 'Ride': 4.5, '...",car drive fantast sporti enough need econom en...,Y,Y,...,Y,Y,Y,Y,Y,Y,Y,Y,Y,13
1,Sara,"Updated on Sep 12, 2016",2016 Acura ILX,5,This is a great car but beware before purchasi...,13,"{'Audio': 5.0, 'Technology': 5.0, 'Value': 3.0...",great car bewar purchas im 3000 mile tire almo...,Y,Y,...,N,Y,N,N,Y,N,Y,Y,Y,9
2,Robert,22-Aug-16,2016 Acura ILX,5,"Great size. Not to big, not too small. Because...",13,"{'Audio': 4.5, 'Technology': 4.0, 'Value': 4.5...",great size big small 2 honda accord last one l...,Y,Y,...,N,Y,Y,N,N,N,Y,N,Y,8


# count how much describtion: at leat 4

In [36]:
aspectlist = ['Technology', 'Audio', 'Performance', 'Exterior', 'Ride', 'Seating', 'Interior', 'Efficiency', 'Reliability', 'Safety', 'Comfort', 'Value', 'Satisfaction']

In [37]:
fourdescribtionlist = []
for i in range(0, len(df)):
    count = 0
    for aspect in aspectlist:
        if df.loc[i][aspect] == "Y":
            count += 1
    fourdescribtionlist.append(count)
            
        
        
    

In [38]:
df.loc[10065]['Audio']

'N'

In [39]:
len(fourdescribtionlist)

10361

In [40]:
df['fourdescribtionlist'] = np.asarray(fourdescribtionlist)

In [41]:
df.head(3)

Unnamed: 0,author,date,vehicle,overall_rate,text,total_category,category_rates,textclean,Technology,Audio,...,Ride,Seating,Interior,Efficiency,Reliability,Safety,Comfort,Value,Satisfaction,fourdescribtionlist
0,Daniel,"Updated on Dec 4, 2016",2016 Acura ILX,4,The car drives fantastically. It's sporty enou...,13,"{'Value': 4.0, 'Exterior': 3.0, 'Ride': 4.5, '...",car drive fantast sporti enough need econom en...,Y,Y,...,Y,Y,Y,Y,Y,Y,Y,Y,Y,13
1,Sara,"Updated on Sep 12, 2016",2016 Acura ILX,5,This is a great car but beware before purchasi...,13,"{'Audio': 5.0, 'Technology': 5.0, 'Value': 3.0...",great car bewar purchas im 3000 mile tire almo...,Y,Y,...,N,Y,N,N,Y,N,Y,Y,Y,9
2,Robert,22-Aug-16,2016 Acura ILX,5,"Great size. Not to big, not too small. Because...",13,"{'Audio': 4.5, 'Technology': 4.0, 'Value': 4.5...",great size big small 2 honda accord last one l...,Y,Y,...,N,Y,Y,N,N,N,Y,N,Y,8


In [42]:
df.loc[10065]

author                                                            Sydney
date                                              Updated on Aug 6, 2016
vehicle                                           2016 Volkswagen Passat
overall_rate                                                           3
text                   Console restricts leg room for right leg while...
total_category                                                        13
category_rates         {'Value': 3.0, 'Satisfaction': 3.0, 'Comfort':...
textclean              consol restrict leg room right leg drive find ...
Technology                                                             N
Audio                                                                  N
Performance                                                            Y
Exterior                                                               Y
Ride                                                                   N
Seating                                            

In [43]:
df.to_csv(r'/Users/liyanan/Desktop//trytrytry/6-paper/describeaspects.csv')

In [33]:
 fouraspectlist = []
for i in df.category_rates:
        j = ast.literal_eval(i)
        if j.get('Technology', 0) != 0 and j.get('Performance', 0) != 0 and j.get('Interior',0)!= 0 and j.get('Comfort',0) != 0:
            fouraspectlist.append('Y')
        else:
            fouraspectlist.append('N')
            
        
        

In [34]:
len(fouraspectlist)

10066

In [35]:
print(fouraspectlist)

['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',

In [36]:
df['fouraspectlist'] = np.asarray(fouraspectlist)
    

In [37]:
df.head(3)

Unnamed: 0,author,date,vehicle,overall_rate,text,total_category,category_rates,fouraspectlist
0,Daniel,"Updated on Dec 4, 2016",2016 Acura ILX,4,The car drives fantastically. It's sporty enou...,13,"{'Value': 4.0, 'Exterior': 3.0, 'Ride': 4.5, '...",Y
1,Sara,"Updated on Sep 12, 2016",2016 Acura ILX,5,This is a great car but beware before purchasi...,13,"{'Audio': 5.0, 'Technology': 5.0, 'Value': 3.0...",Y
2,Robert,22-Aug-16,2016 Acura ILX,5,"Great size. Not to big, not too small. Because...",13,"{'Audio': 4.5, 'Technology': 4.0, 'Value': 4.5...",Y


In [38]:
df.to_csv(r'/Users/liyanan/Desktop//trytrytry/3-scrape_more/describe4apects_2.csv')