In [1]:
import random
import pandas as pd
import numpy as np
import nltk
import spacy
import sklearn
import multiprocessing
import gensim.models
import gensim.downloader as api


from gensim import utils
from unidecode import unidecode
from nltk.stem import PorterStemmer
from gensim.test.utils import datapath
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.fasttext import FastText as FT_gensim
from gensim.parsing.preprocessing import strip_punctuation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDiA
from gensim.parsing.preprocessing import strip_multiple_whitespaces


import pre


np.random.seed(42)

In [30]:
print('Pandas version: {}'.format(pd.__version__))
print('Sklearn version: {}'.format(sklearn.__version__))
print('Numpy version: {}'.format(np.__version__))

files = ['Audi.csv', 'BMW.csv', 'Chrysler.csv', 'Ford.csv', 'GMC.csv', 'Honda.csv', 
         'Mercedes-Benz.csv', 'Nissan.csv', 'Toyota.csv']

Pandas version: 0.25.2
Sklearn version: 0.22.1
Numpy version: 1.17.3


In [61]:
# concatenate all dfs and pick different makes to individual DF.

# concatenate
df_0 = pd.read_csv('2_location_added/merged0.csv')
df_1 = pd.read_csv('2_location_added/merged1.csv')
df_2 = pd.read_csv('2_location_added/merged2.csv')
df_3 = pd.read_csv('2_location_added/merged3.csv')

df_final = df_0.append(df_1, sort=False, ignore_index=True)
df_final = df_final.append(df_2, sort=False, ignore_index=True)
df_final = df_final.append(df_3, sort=False, ignore_index=True)

# get individual DF 
def get_make_df(*kw):
    for file in kw:
        make = file.split('.')[0]
        df = df_final[df_final['Make']==make]
        df.dropna(inplace=True)
        df['Comments'] = df['Comments'].apply(lambda x: str(x).replace('-PRON-', ''))
        df.to_csv('Make_data/{}.csv'.format(make), index=False)

get_make_df(*files)

In [62]:
s = pd.read_csv('Make_data/Audi.csv')
s.head()

Unnamed: 0,Make,Model,Year,Comments,Label,Date,Ori_comments,country,state
0,Audi,100,1993,drive flat temp stay normal go hill temp start...,cooling_system,Feb 04 2006,"[""' when driving on the flat the temp stays no...",scotland,inverclyde
1,Audi,100,1992,car hard start cold warm car start ok jim c,engine,Jan 03 2007,"[""' car is hard starting when cold, after warm...",us,ks
2,Audi,100,1991,m tired cat go love car s real wish enjo...,engine,Feb 20 2008,"[""' I'm so tired of this cat if I could it wou...",us,il
3,Audi,200,1986,the clutch liquid leaking the problem piston ...,clutch,Jul 14 2009,"[""' The clutch liquid was leaking. The problem...",australia,qb
4,Audi,4000,1987,suddenly hard time start rpms 10 only all old...,engine,Aug 07 2003,"[""' Suddenly hard time starting, RPMs are at 1...",us,ct


In [63]:
# 80/20 split

def split_data(file):
    '''
    first need to drop nan rows and reset the index, which inherit form original DF to 0 ... n.
    '''
    df = pd.read_csv('Make_data/{}'.format(file))
    df = df.dropna(axis=0, how='any')
    df.reset_index(inplace=True, drop=True)
    
#     return df 
    df_train, df_test = train_test_split(df, test_size=0.20, random_state=1)
    
    df_train.to_csv('Make_data/train/{}_train.csv'.format(file.split('.')[0]), index=False)
    df_test.to_csv('Make_data/test/{}_test.csv'.format(file.split('.')[0]), index=False)

    
for file in files:
    split_data(file)


In [64]:
# *_topic_score.csv is a file saved whole matrix of topic score by LDiA model 

def topic_generator(file):

    df_train = pd.read_csv('Make_data/train/{}_train.csv'.format(file.split('.')[0]))

    corpus = []
    for i in range(df_train.shape[0]):
        corpus.append(df_train['Comments'][i])
    
#   BOW
#   LDiA works with raw BOW count vectors rather than normalized TF-IDF vectors -- NLP in action.
    count_vect = CountVectorizer()
    df_bow = pd.DataFrame(data=count_vect.fit_transform(raw_documents=corpus).toarray())
    column_nums, terms = zip(*sorted(zip(count_vect.vocabulary_.values(), count_vect.vocabulary_.keys())))
    df_bow.column_nums = terms
    
#   ldia generate topics
    ldia = LDiA(n_components=10)
    ldia.fit_transform(df_bow)
    
    df_topics = pd.DataFrame(data=ldia.components_.T, index=terms)
    
    df_topics.to_csv('Make_data/train/topics/{}_topic_score.csv'.format(file.split('.')[0]))

#  run   
for file in files:
    topic_generator(file)

# topic_generator(files[0])

In [65]:
# *_word_matrix.csv is a word-topic matrix, 9 row is 9 word of each topic, and 10 cols means 10 topic files  

def topic_words(file):
    df = pd.read_csv('Make_data/train/topics/{}_topic_score.csv'.format(file.split('.')[0]), index_col='Unnamed: 0')
    df_words = pd.DataFrame()
    for i in df.columns:
        df_words['File{}'.format(i)] = list(df[i].nlargest(9).index)
        
    df_words.to_csv('Make_data/train/topics/{}_word_matrix.csv'.format(file.split('.')[0]), index=False)

# run    
for file in files:
    topic_words(file)

In [66]:
# *_score_matrix.csv is save format with above cell but score instide of words.
def topic_score(file):
    df = pd.read_csv('Make_data/train/topics/{}_topic_score.csv'.format(file.split('.')[0]), index_col='Unnamed: 0')
    df_score = pd.DataFrame()
    for i in df.columns:
        df_score['File{}'.format(i)] = list(df[i].nlargest(9))
    df_score.to_csv('Make_data/train/topics/{}_score_matrix.csv'.format(file.split('.')[0]), index=False)
    
# run
for file in files:
    topic_score(file)

In [95]:
a = pd.read_csv('Make_data/train/topics/Audi_score_matrix.csv')
a

Unnamed: 0,File0,File1,File2,File3,File4,File5,File6,File7,File8,File9
0,296.694651,316.8719,4883.774757,932.03779,3731.152353,1035.939394,2406.539052,546.090868,1279.959273,1117.794642
1,289.099946,254.26976,2725.035941,700.165297,3626.054335,845.224447,1354.772314,485.797307,618.152024,657.768558
2,150.099963,176.679438,1771.521104,579.767382,1285.997126,693.974471,1287.298285,408.598726,503.877528,517.379585
3,146.099975,162.261046,1382.83982,454.456319,1076.50534,690.837063,987.97794,361.414361,472.478736,265.479555
4,135.60667,133.813912,1025.714464,446.530526,1070.864511,614.457676,922.979923,353.341372,457.014102,255.172327
5,114.130682,130.792126,1016.14441,389.942989,1020.176169,613.216522,715.105756,336.773774,440.120868,235.213137
6,104.140337,103.129619,987.034686,331.678393,881.011663,558.651682,673.429447,308.160217,418.629723,168.999072
7,101.001176,91.302352,941.118678,313.734026,786.776135,455.870697,640.050979,275.808483,363.001969,158.461919
8,98.476156,88.479413,847.281864,284.901493,777.450528,405.968433,610.809738,223.605868,362.84512,133.683154


In [168]:
# use LDiA to get topic words in test sets

n = 1001

test = pd.read_csv('Make_data/test/Audi_test.csv')
cur_cop = [test['Comments'][n]]
count_vect = CountVectorizer()
df_bow = pd.DataFrame(data=count_vect.fit_transform(raw_documents=cur_cop).toarray())
column_nums, terms = zip(*sorted(zip(count_vect.vocabulary_.values(), count_vect.vocabulary_.keys())))
df_bow.column_nums= terms

#   ldia generate topics
ldia = LDiA(n_components=1)
ldia.fit_transform(df_bow)

df_topics = pd.DataFrame(data=ldia.components_.T, index=terms)
test_score = df_topics[0].nlargest(9)

In [169]:
# np.dot(test_score,)
train_mat = pd.read_csv('Make_data/train/topics/Audi_score_matrix.csv')
# train_mat
np.dot(np.array(test_score).reshape(1, -1), train_mat)
# cosine_similarity(train_mat.T, np.array(test_score).reshape(1, -1))

array([[ 3752.69364812,  3825.28127733, 41924.10306899, 11532.85521563,
        38231.68547314, 15094.25614366, 25234.51445505,  8401.08321732,
        12706.6262428 ,  9578.32624192]])

In [170]:
s = pd.read_csv('Make_data/test/Audi_test.csv')
print(s['Comments'][n] + '\n')
print(s['Ori_comments'][n])
print(s['Label'][n])
pd.read_csv('Make_data/train/topics/Audi_word_matrix.csv')
# s.head()

oem wiper provide audi 2008 5 a4 model year onwards water slush freeze solid winter render useless create significant driving hazard audi replacement non responsive further third party replacement winter style blade proprietary blade arm connector

['Oem wipers provided by Audi on 2008.5 A4 model year onwards fill with water/slush and freeze solid in the winter rendering them useless and creating significant driving hazard. Audi has no replacement and is non-responsive. Further, there are no third-party replacement winter-style blades because it is a proprietary blade-arm connector. Arlington, MA, USA']
windows_windshield


Unnamed: 0,File0,File1,File2,File3,File4,File5,File6,File7,File8,File9
0,control,belt,the,car,audi,audi,car,problem,fuel,tire
1,arm,timing,contact,light,car,recall,vehicle,consumer,pump,brake
2,rod,engine,vehicle,seat,oil,light,drive,vehicle,vehicle,audi
3,tie,chain,failure,the,engine,headlight,the,the,ignition,problem
4,replace,tensioner,repair,brake,mile,problem,stop,audi,engine,replace
5,audi,audi,audi,driver,problem,issue,audi,cluster,recall,mile
6,consumer,mile,manufacturer,vehicle,issue,abs,transmission,instrument,tank,dealer
7,noise,failure,state,come,service,car,brake,state,replace,rotor
8,recall,cause,recall,passenger,the,drive,speed,replace,leak,wheel


## header replace

In [334]:
# Hand made labels

# honda
# honda_header = ['transmission', 'engine/seat', 'break', 'safety', 'door', 'service', 'service', 'service',
#                 'wheel', 'batery/body']
# df_honda = pd.read_csv('Make_data/train/topics/Honda_word_matrix.csv')
# df_honda.columns = honda_header

# # audi
# audi_header = ['engine', 'fuel syst', 'service', 'service', 'service', 'engine', 'safety',
#               'door/gas', 'break', 'wheel/transmission']
# df_audi = pd.read_csv('Make_data/train/topics/Audi_word_matrix.csv')
# df_audi.columns = audi_header

# # bmw
# bmw_header = ['wheel', 'safety', 'service', 'service', 'door', 'service', 'power/fuel', 'safety',
#              'transmission', 'safety']
# df_bmw = pd.read_csv('Make_data/train/topics/BMW_word_matrix.csv')
# df_bmw.columns = bmw_header

# # chrysler
# # shift maybe means transmission?
# chrysler_header = ['door/seat', 'steering', 'service', 'transmission', 'horn/engine', 'engine/oil', 'wheel',
#                   'engine', 'mechanical', 'tire/']
# df_chrysler = pd.read_csv('Make_data/train/topics/Chrysler_word_matrix.csv')
# df_chrysler.columns = chrysler_header

# # ford
# ford_header = ['door', 'wheel/body', 'safety', 'service', 'fuel syst', 'transmission', 'unknown', 
#                'engine', 'wheel', 'truck']
# df_ford = pd.read_csv('Make_data/train/topics/Ford_word_matrix.csv')
# df_ford.columns = ford_header

# # gmc
# gmc_header = ['engine', 'windshield', 'door', 'service', 'safety', 'service',
#              'speedometer', 'fuel syst', 'engine/tire', 'engine/body']
# df_gmc = pd.read_csv('Make_data/train/topics/GMC_word_matrix.csv')
# df_gmc.columns = gmc_header

# # mb
# mb_header = ['safety', 'service', 'unknown', 'engine', 'pedal', 'fuel syst', 'wheel/bulb', 
#              'unknown', 'steering', 'transmission']
# df_mb = pd.read_csv('Make_data/train/topics/MB_word_matrix.csv')
# df_mb.columns = mb_header

# # nissan
# nissan_header = ['transmission', 'fuel syst', 'windshield', 'driving', 'unknown', 
#                 'service', 'door/wheel', 'transmission/truck', 'transmission', 'safety']
# df_nissan = pd.read_csv('Make_data/train/topics/Nissan_word_matrix.csv')
# df_nissan.columns = nissan_header

# # toyota
# toyota_header = ['wheel', 'frame', 'pedal', 'service', 'unknown', 'safety', 
#                 'service', 'engine', 'engine', 'service']
# df_toyota = pd.read_csv('Make_data/train/topics/Toyota_word_matrix.csv')
# df_toyota.columns = toyota_header


## LDiA results

In [3]:
rand_index = random.randrange(test.shape[0])
print(rand_index)

# got test file in bow shape. Row are files, cols are words. 
test = pd.read_csv('Make_data/test/Audi_score.csv')

train = pd.read_csv('Make_data/train/topics/Audi_score_matrix.csv')

cosine_similarity(train.T, test.iloc[rand_index].nlargest(9).values.reshape(1,-1))

NameError: name 'test' is not defined

In [335]:
df_audi

Unnamed: 0,engine,fuel syst,service,service.1,service.2,engine.1,safety,door/gas,break,wheel/transmission
0,brake,fuel,audi,vehicle,recall,light,seat,car,audi,car
1,coil,audi,car,contact,contact,audi,passenger,start,problem,drive
2,car,vehicle,mile,failure,audi,problem,driver,gas,light,tire
3,vehicle,pump,oil,state,repair,engine,vehicle,time,abs,stop
4,stop,engine,problem,dealer,available,car,car,happen,replace,audi
5,ignition,car,issue,audi,manufacturer,headlight,airbag,turn,failure,time
6,because,tank,service,mileage,takata,drive,rear,door,fail,vehicle
7,drive,problem,dealer,000,nhtsa,issue,because,problem,cluster,speed
8,dealer,leak,tell,repair,receive,come,window,lock,module,transmission


In [337]:
comment = pd.read_csv('Make_data/test/Audi_test.csv')
print(comment['Comments'][392])



## DL

Training model with data from Car Complains

In [458]:
# save original data to a corpus list

files = ['Audi.csv', 'BMW.csv', 'Chrysler.csv', 'Ford.csv', 'GMC.csv', 'Honda.csv', 
         'Mercedes-Benz.csv', 'Nissan.csv', 'Toyota.csv']

sp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
ps = PorterStemmer()

def simple_processor(token):
    str = unidecode(token)
    str = strip_punctuation(str)
    str = strip_multiple_whitespaces(str)
    
    tokens = str.split(' ')
    tokens = [ps.stem(token) for token in tokens]
    str = ' '.join(tokens)
    return str

def re_pre(file):
    df = pd.read_csv('Make_data/{}'.format(file))
    
    corpus = []
    for comment in df['Ori_comments']:
        corpus.append(simple_processor(comment))
    
    with open('Corpus_data/{}_corpus.txt'.format(file.split('.')[0]), mode='w', encoding='utf-8') as f:
        for line in corpus:
            f.write(line + '\n')
        
for file in files:
    re_pre(file)

In [180]:
worker = multiprocessing.cpu_count()    # num of processors
path_list = ['Corpus_data/{}_corpus.txt'.format(file.split('.')[0]) for file in files]      # corpus path

class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
#     def __init__(self):
#         self.path = path
        
    def __iter__(self):
        for line in open(path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

# training 
for path_ in path_list:
    path = path_ 
    sentence = MyCorpus()
    model_w2v = gensim.models.Word2Vec(sentences=sentence,
                                       workers=worker,
                                       sg=0,
                                       size=300)
    
    model_name = '{}_Word2Vect_model'.format(path.split('/')[1].split('_')[0])
    model_w2v.save('pre_trained/{}'.format(model_name))
    
# model_w2v.wv.vocab

In [49]:
ps = PorterStemmer()

model_path = 'pre_trained/W2V/BMW_Word2Vect_model'
model_w2v = gensim.models.Word2Vec.load(model_path)
model_w2v.wv.most_similar('{}'.format(ps.stem('pedal')), topn=10)

[('depress', 0.7829334735870361),
 ('press', 0.7527486085891724),
 ('peddl', 0.7344381809234619),
 ('appli', 0.7320843935012817),
 ('harder', 0.7017271518707275),
 ('foot', 0.6738139390945435),
 ('push', 0.6158410906791687),
 ('clutch', 0.6024237871170044),
 ('acceler', 0.5961849093437195),
 ('engag', 0.589065670967102)]

FastText Model

In [None]:
# training 

for path_ in path_list:
    path = path_ 
    model_ft = FT_gensim(size=100)
    # build the vocabulary
    model_ft.build_vocab(corpus_file=path)
    # train the model
    model_ft.train(
        corpus_file=path, epochs=model_ft.epochs,
        total_examples=model_ft.corpus_count, total_words=model_ft.corpus_total_words
    )
    model_name = 'pre_trained/FastText/{}_FastText_model'.format(path.split('/')[1].split('_')[0])
    model_ft.save(model_name)

In [46]:
# FT load pre_trained
ft_model = FT_gensim.load('pre_trained/FastText/Honda_FastText_model')
ft_model.most_similar(positive='transmiss', topn=20)

[('ntransmiss', 0.985079824924469),
 ('transmis', 0.9763768315315247),
 ('transmisson', 0.9520970582962036),
 ('transm', 0.9388997554779053),
 ('transmit', 0.9368417263031006),
 ('transmitt', 0.9245721101760864),
 ('transimiss', 0.9070486426353455),
 ('tranmiss', 0.8943817615509033),
 ('trasmiss', 0.8651485443115234),
 ('transaxl', 0.854050874710083),
 ('tran', 0.8468038439750671),
 ('transistor', 0.8428468704223633),
 ('tranni', 0.8410258293151855),
 ('translat', 0.8358073234558105),
 ('transient', 0.8241763114929199),
 ('odysseytransmiss', 0.8197256326675415),
 ('transact', 0.7909071445465088),
 ('transcript', 0.7709774374961853),
 ('transit', 0.7564530968666077),
 ('transpar', 0.7528682947158813)]

### most similar words table

In [44]:
files = ['Audi.csv', 'BMW.csv', 'Chrysler.csv', 'Ford.csv', 'GMC.csv', 'Honda.csv', 
         'Mercedes-Benz.csv', 'Nissan.csv', 'Toyota.csv']

sample_word = 'transmission'

ps = PorterStemmer()
stemed_sample_word = ps.stem(sample_word)

def similar_word_table(token, files):
    df = pd.DataFrame()
    for file in files:
        model_path = 'pre_trained/W2V/{}_Word2Vect_model'.format(file.split('.')[0])
        model = gensim.models.Word2Vec.load(model_path)
        
        words = []
        similmar_list = model.most_similar(token)
        for word in similmar_list:
            words.append(word[0])
        df['{}'.format(file.split('.')[0])] = words
        
    df.to_csv('sample_words.csv', index=False)
    
similar_word_table(stemed_sample_word, files)
    

In [45]:
a = pd.read_csv('sample_words.csv')
a

Unnamed: 0,Audi,BMW,Chrysler,Ford,GMC,Honda,Mercedes-Benz,Nissan,Toyota
0,clutch,gear,tran,tran,gear,tranni,gear,tranni,tran
1,turbo,mode,clutch,tranni,tran,tran,electron,tran,tranni
2,mechatron,clutch,gear,transaxl,tranni,clutch,shift,radiat,clutch
3,dsg,engin,tranni,clutch,engin,ntransmiss,conductor,cvt,gear
4,tcm,revers,shifter,ntransmiss,convert,starter,flush,clutch,transaxl
5,mechtron,shift,engin,transmisson,shift,convert,comput,coolant,ecm
6,modul,gearbox,shift,ptu,pinion,compressor,balanc,thermostat,compressor
7,engin,smg,radiat,differenti,clutch,differenti,shifter,motor,convert
8,cvt,sequenti,thermostat,solenoid,cylind,radiat,shaft,differenti,solenoid
9,unit,automat,transaxl,egl,lifter,gear,fluid,turbo,tranmiss
