## Import Libraries

In [1]:
import nltk
import pandas as pd
import string


## Load csv files to data frames


In [2]:
#train_df = pd.read_csv('data/train.csv',encoding = "latin1")
#test_df = pd.read_csv('data/test.csv',encoding = "latin1")
#attributes_df = pd.read_csv('data/attributes.csv',encoding = "latin1")
#product_descriptions_df = pd.read_csv('data/product_descriptions.csv',encoding = "latin1")
#sample_submission_df = pd.read_csv('data/sample_submission.csv', encoding = "latin1")
from pyspark.sql import SparkSession
spark  = SparkSession.builder.appName("task14").getOrCreate()
train_df = spark.read.option("header", "true").option("mode", "DROPMALFORMED").csv("data/train.csv")
product_descriptions_df = spark.read.option("header", "true").option("mode", "DROPMALFORMED").csv("data/product_descriptions.csv")

In [3]:
merged_df = train_df.join(product_descriptions_df,'product_uid')
merged_rdd = merged_df.rdd.map(tuple)

In [4]:
print(merged_rdd.first())

('100001', '3', 'Simpson Strong-Tie 12-Gauge Angle', 'l bracket', '2.5', '"Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a ""Z"" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws"')


In [26]:
#print(train_df.dtypes)

id                 int64
product_uid        int64
product_title     object
search_term       object
relevance        float64
dtype: object


In [28]:
#print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [29]:
#print(product_descriptions_df.head())

   product_uid                                product_description
0       100001  Not only do angles make joints stronger, they ...
1       100002  BEHR Premium Textured DECKOVER is an innovativ...
2       100003  Classic architecture meets contemporary design...
3       100004  The Grape Solar 265-Watt Polycrystalline PV So...
4       100005  Update your bathroom with the Delta Vero Singl...


# Data Cleaning

Download NLTK's stopwords list and WordNetLemmatizer

In [30]:
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')
# nltk.download('stopwords')
# nltk.download('wordnet')


Process and tokenize the raw text by:
    1. Convert to lower case
    2. Remove apostrophe
    3. Remove Punctuation
    4. Lemmatize

In [5]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
   
    # convert text to lower case
    lowercase = str(text).lower()
    
    #remove 's from string
    apoRemoved = lowercase.replace("'s","")
    
    #convert don't to dont
    apoRemoved = apoRemoved.replace("'","")
    
    #handle other punctuations
    transtable = str.maketrans(string.punctuation,"                                ")
    brokenWords = apoRemoved.translate(transtable)
    
    #convert string to list of words
    listOfWords =  nltk.word_tokenize(brokenWords)
    
    #lemmatize text
    lemmatizedList=[lemmatizer.lemmatize(word) for word in listOfWords]
   
    return lemmatizedList


In [32]:
# test case
text = "Dogs Here's don't cars."
print(process(text))

['dog', 'here', 'dont', 'car']


In [33]:
# process train_df
def process_train_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_title'] = process(row['product_title'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf


In [34]:
#processed_train_df = process_train_df(train_df)
#print(processed_train_df.head())

   id  product_uid                                      product_title  \
0   2       100001           [simpson, strong, tie, 12, gauge, angle]   
1   3       100001           [simpson, strong, tie, 12, gauge, angle]   
2   9       100002  [behr, premium, textured, deckover, 1, gal, sc...   
3  16       100005  [delta, vero, 1, handle, shower, only, faucet,...   
4  17       100005  [delta, vero, 1, handle, shower, only, faucet,...   

          search_term  relevance  
0       angle bracket       3.00  
1           l bracket       2.50  
2           deck over       3.00  
3    rain shower head       2.33  
4  shower only faucet       2.67  


In [35]:
# process 
def process_product_descriptions_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_description'] = process(row['product_description'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf

In [36]:
#processed_product_descriptions_df = process_product_descriptions_df(product_descriptions_df)
#print(processed_product_descriptions_df.head())

   product_uid                                product_description
0       100001  [not, only, do, angle, make, joint, stronger, ...
1       100002  [behr, premium, textured, deckover, is, an, in...
2       100003  [classic, architecture, meet, contemporary, de...
3       100004  [the, grape, solar, 265, watt, polycrystalline...
4       100005  [update, your, bathroom, with, the, delta, ver...


In [6]:
processed_rdd = merged_rdd.map(lambda row: (row[0], row[1], process(row[2]), process(row[3]), float(row[4]), process(row[5])))

In [8]:
processed_rdd.first()

('100001',
 '3',
 ['simpson', 'strong', 'tie', '12', 'gauge', 'angle'],
 ['l', 'bracket'],
 2.5,
 ['not',
  'only',
  'do',
  'angle',
  'make',
  'joint',
  'stronger',
  'they',
  'also',
  'provide',
  'more',
  'consistent',
  'straight',
  'corner',
  'simpson',
  'strong',
  'tie',
  'offer',
  'a',
  'wide',
  'variety',
  'of',
  'angle',
  'in',
  'various',
  'size',
  'and',
  'thickness',
  'to',
  'handle',
  'light',
  'duty',
  'job',
  'or',
  'project',
  'where',
  'a',
  'structural',
  'connection',
  'is',
  'needed',
  'some',
  'can',
  'be',
  'bent',
  'skewed',
  'to',
  'match',
  'the',
  'project',
  'for',
  'outdoor',
  'project',
  'or',
  'those',
  'where',
  'moisture',
  'is',
  'present',
  'use',
  'our',
  'zmax',
  'zinc',
  'coated',
  'connector',
  'which',
  'provide',
  'extra',
  'resistance',
  'against',
  'corrosion',
  'look',
  'for',
  'a',
  'z',
  'at',
  'the',
  'end',
  'of',
  'the',
  'model',
  'number',
  'versatile',
  'conn

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc
def tfidf_sim(row, stopwords=nltk.corpus.stopwords.words('english')):
    title = row[2]
    search = row[3]
    relevance = row[4]
    desc = row[5]
    title_search = []
    title_search.append(title)
    title_search.append(search)
    desc_search = []
    desc_search.append(desc)
    desc_search.append(search)
    vect = TfidfVectorizer(analyzer='word',tokenizer=dummy_fun,preprocessor=dummy_fun,token_pattern=None,stop_words=stopwords,use_idf=False)
    tfidf1 = vect.fit_transform(title_search)
    tfidf2 = vect.fit_transform(desc_search)
    title_sim_value = (tfidf1 * tfidf1.T).A[0,1]
    desc_sim_value = (tfidf2 * tfidf2.T).A[0,1]
    return (row[0],row[1],row[2], row[3], float(title_sim_value), float(desc_sim_value), relevance)

In [12]:
new_rdd = processed_rdd.map(tfidf_sim)

In [13]:
new_rdd.take(2)

[('100001',
  '3',
  ['simpson', 'strong', 'tie', '12', 'gauge', 'angle'],
  ['l', 'bracket'],
  0.0,
  0.0,
  2.5),
 ('100001',
  '2',
  ['simpson', 'strong', 'tie', '12', 'gauge', 'angle'],
  ['angle', 'bracket'],
  0.2886751345948129,
  0.11826247919781652,
  3.0)]

In [15]:
new_df = new_rdd.toDF(["product_id","id","product_title","search_term","title_sim","desc_sim","label"])

In [16]:
new_df.show()

+----------+---+--------------------+--------------------+-------------------+-------------------+-----+
|product_id| id|       product_title|         search_term|          title_sim|           desc_sim|label|
+----------+---+--------------------+--------------------+-------------------+-------------------+-----+
|    100001|  3|[simpson, strong,...|        [l, bracket]|                0.0|                0.0|  2.5|
|    100001|  2|[simpson, strong,...|    [angle, bracket]| 0.2886751345948129|0.11826247919781652|  3.0|
|    100002|  9|[behr, premium, t...|        [deck, over]|                0.0|0.22808577638091165|  3.0|
|    100005| 17|[delta, vero, 1, ...|[shower, only, fa...|0.42640143271122083|0.15339299776947407| 2.67|
|    100005| 16|[delta, vero, 1, ...|[rain, shower, head]|0.17407765595569785|0.06262242910851495| 2.33|
|    100006| 21|[whirlpool, 1, 9,...|         [microwave]| 0.2886751345948129|0.13784910335911552|  3.0|
|    100006| 20|[whirlpool, 1, 9,...|[microwave, over,.

In [17]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["title_sim","desc_sim"],outputCol="features")
final_df = assembler.transform(new_df).drop('product_id')

In [19]:
final_df.select("features").show(truncate=False)

+-----------------------------------------+
|features                                 |
+-----------------------------------------+
|(2,[],[])                                |
|[0.2886751345948129,0.11826247919781652] |
|[0.0,0.22808577638091165]                |
|[0.42640143271122083,0.15339299776947407]|
|[0.17407765595569785,0.06262242910851495]|
|[0.2886751345948129,0.13784910335911552] |
|[0.2041241452319315,0.09747403576571587] |
|[0.2041241452319315,0.09747403576571587] |
|[0.4472135954999579,0.24077170617153837] |
|[0.5773502691896258,0.27407548393101266] |
|[0.26726124191242434,0.36293309315564887]|
|[0.3721042037676254,0.1900028500641266]  |
|(2,[],[])                                |
|[0.282842712474619,0.7009039702739965]   |
|[0.0,0.14213381090374028]                |
|(2,[],[])                                |
|(2,[],[])                                |
|[0.0,0.07106690545187014]                |
|[0.3481553119113957,0.2057377999494559]  |
|[0.17407765595569785,0.30860669

In [20]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=5, regParam=0.3, elasticNetParam=0.8)

In [21]:
lrModel = lr.fit(final_df)
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.0,0.0]
Intercept: 2.381798094242814
numIterations: 1
objectiveHistory: [0.4999999999999982]
+--------------------+
|           residuals|
+--------------------+
| 0.11820190575718614|
|  0.6182019057571861|
|  0.6182019057571861|
| 0.28820190575718607|
|-0.05179809424281...|
|  0.6182019057571861|
| 0.28820190575718607|
|  0.6182019057571861|
| 0.28820190575718607|
|  0.6182019057571861|
| 0.28820190575718607|
| 0.28820190575718607|
| 0.28820190575718607|
|  0.6182019057571861|
|-0.05179809424281...|
|-0.05179809424281...|
| -0.7117980942428139|
| -1.3817980942428139|
|-0.05179809424281...|
|-0.05179809424281...|
+--------------------+
only showing top 20 rows

RMSE: 0.534005
r2: 0.000000
