In [149]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

### Import Data

In [None]:
# Load the three pre-split CSV files that make up the data set.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
val = pd.read_csv('val_data.csv')

# Stack the splits row-wise in train -> val -> test order.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every split keeps its own row labels, so the same label would appear up to
# three times and any later label-based alignment would be ambiguous.
df = pd.concat([train, val, test], axis=0, ignore_index=True)

In [151]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,review_id,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,...,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,sentiment,date
0,891454,1117870798.0,5,1.0,0.285714,21,15,6,2008-08-29,it is cruel for nature to combine aging and ac...,...,light,,combination,,P7365,Pure Skin Clarifying Dietary Supplement,Murad,50.0,2,2008-08-29
1,24870,1221497549.0,4,1.0,1.0,0,0,0,2008-09-03,love it i had to get used to it not foaming it...,...,light,,dry,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0,2,2008-09-03
2,780337,1229624109.0,5,1.0,1.0,4,0,4,2008-09-04,this product is great i only have to use it oc...,...,mediumTan,,combination,,P122661,7 Day Face Scrub Cream Rinse-Off Formula,CLINIQUE,26.0,2,2008-09-04
3,622360,1526018747.0,5,1.0,0.111111,9,8,1,2008-09-06,this stuff works so well,...,,,,,P2046,Brumisateur Natural Mineral Water Facial Spray...,Evian,23.5,2,2008-09-06
4,478352,54172647.0,5,1.0,1.0,0,0,0,2008-09-11,love it it’s moisturizing and helps keep conce...,...,lightMedium,,combination,,P174502,All About Eyes Rich Eye Cream,CLINIQUE,37.0,2,2008-09-11


In [152]:
# Count the rows whose review_title is missing.
df['review_title'].isna().sum()

15583

In [153]:
# Show the first five rows of the frame once more.
df.head(n=5)

Unnamed: 0,review_id,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,...,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,sentiment,date
0,891454,1117870798.0,5,1.0,0.285714,21,15,6,2008-08-29,it is cruel for nature to combine aging and ac...,...,light,,combination,,P7365,Pure Skin Clarifying Dietary Supplement,Murad,50.0,2,2008-08-29
1,24870,1221497549.0,4,1.0,1.0,0,0,0,2008-09-03,love it i had to get used to it not foaming it...,...,light,,dry,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0,2,2008-09-03
2,780337,1229624109.0,5,1.0,1.0,4,0,4,2008-09-04,this product is great i only have to use it oc...,...,mediumTan,,combination,,P122661,7 Day Face Scrub Cream Rinse-Off Formula,CLINIQUE,26.0,2,2008-09-04
3,622360,1526018747.0,5,1.0,0.111111,9,8,1,2008-09-06,this stuff works so well,...,,,,,P2046,Brumisateur Natural Mineral Water Facial Spray...,Evian,23.5,2,2008-09-06
4,478352,54172647.0,5,1.0,1.0,0,0,0,2008-09-11,love it it’s moisturizing and helps keep conce...,...,lightMedium,,combination,,P174502,All About Eyes Rich Eye Cream,CLINIQUE,37.0,2,2008-09-11


In [154]:
# List the column labels of the combined frame.
df.columns

Index(['review_id', 'author_id', 'rating', 'is_recommended', 'helpfulness',
       'total_feedback_count', 'total_neg_feedback_count',
       'total_pos_feedback_count', 'submission_time', 'review_text',
       'review_title', 'skin_tone', 'eye_color', 'skin_type', 'hair_color',
       'product_id', 'product_name', 'brand_name', 'price_usd', 'sentiment',
       'date'],
      dtype='object')

### Drop unnecessary features

In [155]:
# Columns removed before modelling; each label appears exactly once.
to_drop = ['review_id', 'author_id', 'submission_time', 'product_name', 'date',
           'brand_name', 'review_title', 'product_id', 'sentiment', 'price_usd']
# Rebind to the reduced frame rather than mutating in place, so the statement
# reads as "new frame derived from the old one".
df = df.drop(columns=to_drop)

### Encoding

In [156]:
# Categorical reviewer attributes that are replaced by ordinal codes.
to_encode = ['skin_tone', 'eye_color', 'skin_type', 'hair_color']

# Fit the encoder on those columns and write the codes back in place of the
# original labels.
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(df[to_encode])
df[to_encode] = encoded_values

### Filling missing values

In [157]:
# Number of missing values in each remaining column.
df.isna().sum()

rating                          0
is_recommended                  0
helpfulness                     0
total_feedback_count            0
total_neg_feedback_count        0
total_pos_feedback_count        0
review_text                   114
skin_tone                    8587
eye_color                   10386
skin_type                    5508
hair_color                  11362
dtype: int64

In [158]:
# Rows without review text are discarded; every other missing value is
# replaced by the sentinel -1.
df = (
    df
    .dropna(subset=['review_text'])
    .fillna(-1)
)

### Feature Engineering for review_text feature

In [159]:
def clean_text(text):
    """Normalise one review into lowercase ASCII alphanumerics.

    Parameters
    ----------
    text : object
        Raw review value. Missing values (``None`` / ``NaN``) yield an empty
        string; any other non-string value is converted with ``str()`` first,
        so a cell parsed as a number does not raise.

    Returns
    -------
    str
        The lowercased text with every character other than ``a-z``, ``0-9``
        and whitespace removed, whitespace runs collapsed to a single space,
        and leading/trailing whitespace stripped.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()                     # Coerce to str, then lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)      # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()     # Remove extra spaces
    return text

# Store the normalised text in a new column, then remove the raw column.
df['review_clean'] = df['review_text'].map(clean_text)
df.drop(columns=['review_text'], inplace=True)

In [160]:
# TF-IDF over unigrams and bigrams, limited to 1000 features.
tfidf_params = dict(ngram_range=(1, 2), max_features=1000)
vectorizer = TfidfVectorizer(**tfidf_params)

# NOTE(review): the vectorizer is fitted on every row currently in df; if df
# still holds validation/test rows, their vocabulary influences the features
# -- confirm this is intended.
X_review = vectorizer.fit_transform(df['review_clean'])

# The cleaned text is no longer needed once it has been vectorised.
df.drop(columns=['review_clean'], inplace=True)

In [161]:
# Separate the label from the structured (non-text) features.
target_col = 'is_recommended'

# Label as integers.
y = df[target_col].astype(int)

# Everything except the label column.
X_structured = df.drop(columns=[target_col])

In [162]:
# (rows, columns) of the structured feature frame.
X_structured.shape

(54620, 9)

### Combine with the rest of the data

In [163]:
# Convert the structured feature frame to a plain NumPy array.
X_structured = X_structured.to_numpy()

In [164]:
# Give both feature matrices the same float64 dtype before they are combined.
X_review = X_review.astype(np.float64)
X_structured = X_structured.astype(np.float64)

In [165]:
# Place the structured columns first and the TF-IDF columns after them, then
# materialise the result as a dense array.
combined_features = hstack((X_structured, X_review))
X = combined_features.toarray()

### Save

In [166]:
# Write the feature matrix and the label vector to .npy files.
for out_path, out_array in (('X_final.npy', X), ('y_final.npy', y.values)):
    np.save(out_path, out_array)