Data Cleaning


In [31]:
import pandas as pd

# Step 1: Load the dataset
df = pd.read_csv('/Users/Sam/Downloads/Recipe Reviews and User Feedback Dataset.csv')

# Step 2: Basic structure and summary of the dataset
print("Dataset Info:")
df.info()

print("\nDataset Description:")
print(df.describe(include='all'))

# Step 3: Check for missing values in each column
missing_values = df.isnull().sum()
print("\nMissing Values Per Column:")
print(missing_values)

# Step 4: Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicate_rows}")

# Step 5: Handling Missing Values (if necessary)
# Example: Dropping rows with missing values in the 'text' column
df = df.dropna(subset=['text'])

# Optional: Reset the index after dropping rows
df.reset_index(drop=True, inplace=True)

# Step 6: Retain only the earliest created_at entry for duplicate text
# within the same recipe and user
df = df.sort_values(by=['created_at'])  # Sort by created_at (ascending)
df = df.drop_duplicates(subset=['recipe_code', 'user_id', 'text'], keep='first')

# Final Overview of Cleaned Data
print("\nCleaned Dataset Overview:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18182 entries, 0 to 18181
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       18182 non-null  int64 
 1   recipe_number    18182 non-null  int64 
 2   recipe_code      18182 non-null  int64 
 3   recipe_name      18182 non-null  object
 4   comment_id       18182 non-null  object
 5   user_id          18182 non-null  object
 6   user_name        18182 non-null  object
 7   user_reputation  18182 non-null  int64 
 8   created_at       18182 non-null  int64 
 9   reply_count      18182 non-null  int64 
 10  thumbs_up        18182 non-null  int64 
 11  thumbs_down      18182 non-null  int64 
 12  stars            18182 non-null  int64 
 13  best_score       18182 non-null  int64 
 14  text             18180 non-null  object
dtypes: int64(10), object(5)
memory usage: 2.1+ MB

Dataset Description:
          Unnamed: 0  recipe_number    

In [32]:
# Step 1: Drop rows with any missing values
df = df.dropna()

# Step 2: Drop duplicate rows
df = df.drop_duplicates()

# Step 3: Drop columns without proper column names (e.g., unnamed columns)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Final check
print("Final Dataset Shape:", df.shape)
print("Columns after cleaning:", df.columns)


Final Dataset Shape: (17878, 14)
Columns after cleaning: Index(['recipe_number', 'recipe_code', 'recipe_name', 'comment_id', 'user_id',
       'user_name', 'user_reputation', 'created_at', 'reply_count',
       'thumbs_up', 'thumbs_down', 'stars', 'best_score', 'text'],
      dtype='object')


In [33]:
df.head()

Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text
544,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_108956,u_1oKVZzv4THPuCQ2sIogt3sy5uFY,knofflerrocks,1,1613035336,0,0,0,5,100,<p>A favorite of my hubby. He loves the flavor...
14324,68,74724,Chocolate Guinness Cake,sp_aUSaElGf_74724_c_115985,u_1oKVdMwdrvRjIXNElUcpvmOBQSJ,kathydew,1,1613035368,0,0,0,4,100,<p>Any ideas for non-alcoholic substitute for ...
616,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_117943,u_1oKVggQUdGID7oRqTsDfStoLlTg,rockstaralice,1,1613035422,0,0,0,5,100,<p>Awesome!!!!!!!!!!!!!!!!!!!!!!</p>
6643,19,1693,Moist Chocolate Cake,sp_aUSaElGf_1693_c_111069,u_1oKVmxKa8OSdxKfiCCutej4RTXe,Punkyjoe81,1,1613035477,0,0,0,5,100,<p>ABSOLUTELY DELCIOUS! And so easy to make! ...
7750,25,23222,Baked Mushroom Chicken,sp_aUSaElGf_23222_c_122000,u_1oKVmrpQvTsv8GyS1JiA39K9QuI,ryanswife,1,1613035506,0,0,0,4,100,<p>Chicken is tender and very good. One of my ...


In [34]:
df.describe()

Unnamed: 0,recipe_number,recipe_code,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score
count,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0
mean,39.164224,21786.024947,2.176306,1623727000.0,0.014879,1.107227,0.558564,4.293545,154.022374
std,29.738791,24072.07686,10.096758,5513454.0,0.139129,4.234164,3.498761,1.538917,142.092555
min,1.0,386.0,0.0,1613035000.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,6086.0,1.0,1622717000.0,0.0,0.0,0.0,5.0,100.0
50%,33.0,14600.0,1.0,1622718000.0,0.0,0.0,0.0,5.0,100.0
75%,64.0,32535.0,1.0,1622718000.0,0.0,0.0,0.0,5.0,100.0
max,100.0,191775.0,520.0,1665756000.0,3.0,106.0,126.0,5.0,946.0


EDA

In [35]:
# import matplotlib.pyplot as plt
# import pandas as pd

# # Ensure datetime column is in correct format for time-based plots
# df['created_at'] = pd.to_datetime(df['created_at'], unit='s')

# # Example 1: Distribution of star ratings
# plt.figure(figsize=(8, 5))
# df['stars'].value_counts().sort_index().plot(kind='bar', color='skyblue', edgecolor='black')
# plt.title('Distribution of Star Ratings', fontsize=16)
# plt.xlabel('Star Rating', fontsize=12)
# plt.ylabel('Count', fontsize=12)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.show()

# # Example 2: Top 10 recipes with the most reviews
# top_recipes = df['recipe_name'].value_counts().head(10)
# plt.figure(figsize=(10, 6))
# top_recipes.plot(kind='bar', color='lightgreen', edgecolor='black')
# plt.title('Top 10 Recipes with Most Reviews', fontsize=16)
# plt.xlabel('Recipe Name', fontsize=12)
# plt.ylabel('Number of Reviews', fontsize=12)
# plt.xticks(rotation=45, ha='right')
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.show()

# # Example 3: Distribution of user reputation
# plt.figure(figsize=(8, 5))
# df['user_reputation'].plot(kind='hist', bins=20, color='coral', edgecolor='black')
# plt.title('Distribution of User Reputation', fontsize=16)
# plt.xlabel('User Reputation', fontsize=12)
# plt.ylabel('Frequency', fontsize=12)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.show()

# # Example 4: Relationship between thumbs up and star ratings
# plt.figure(figsize=(8, 6))
# plt.scatter(df['stars'], df['thumbs_up'], alpha=0.5, color='purple')
# plt.title('Thumbs Up vs. Star Ratings', fontsize=16)
# plt.xlabel('Star Ratings', fontsize=12)
# plt.ylabel('Thumbs Up', fontsize=12)
# plt.grid(alpha=0.5)
# plt.show()

# # Example 5: Time trend of reviews
# monthly_reviews = df.set_index('created_at').resample('M').size()
# plt.figure(figsize=(12, 6))
# monthly_reviews.plot(color='darkblue', linewidth=2)
# plt.title('Number of Reviews Over Time', fontsize=16)
# plt.xlabel('Date', fontsize=12)
# plt.ylabel('Number of Reviews', fontsize=12)
# plt.grid(alpha=0.5)
# plt.show()


data processing

In [36]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Step 1: Drop rows with missing values in the 'text' column
df = df.dropna(subset=['text'])

# Step 2: Convert all text to lowercase
df['text_cleaned'] = df['text'].str.lower()

# Step 3: Remove punctuation, special characters, and numbers
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Step 4: Remove extra whitespace
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Step 5: Remove stopwords
stop_words = set(stopwords.words('english'))
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: ' '.join(
    [word for word in word_tokenize(x) if word not in stop_words]
))

# Step 6: Perform lemmatization (returns base word while preserving meaning)
lemmatizer = WordNetLemmatizer()
df['text_lemmatized'] = df['text_cleaned'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

# Step 7: Tokenization for Word2Vec
df['tokens'] = df['text_lemmatized'].apply(word_tokenize)

# Step 8: Word2Vec Feature Extraction
# Prepare data for Word2Vec model
sentences = df['tokens'].tolist()  # List of tokenized sentences

# Train a Word2Vec model (you can adjust parameters as needed)
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Generate average Word2Vec vectors for each sentence
def get_avg_word2vec(tokens, model, vector_size):
    """Calculate the average Word2Vec vector for a list of tokens."""
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return [0] * vector_size

df['word2vec_features'] = df['tokens'].apply(lambda x: get_avg_word2vec(x, word2vec_model, 100))

# Display the cleaned and processed text along with Word2Vec features
print(df[['text', 'text_cleaned', 'text_lemmatized', 'word2vec_features']].head())


[nltk_data] Downloading package stopwords to /Users/sam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                    text  \
544    <p>A favorite of my hubby. He loves the flavor...   
14324  <p>Any ideas for non-alcoholic substitute for ...   
616                 <p>Awesome!!!!!!!!!!!!!!!!!!!!!!</p>   
6643   <p>ABSOLUTELY DELCIOUS!  And so easy to make! ...   
7750   <p>Chicken is tender and very good. One of my ...   

                                            text_cleaned  \
544                pa favorite hubby loves flavor one ip   
14324    pany ideas nonalcoholic substitute beer recipep   
616                                            pawesomep   
6643   pabsolutely delcious easy make made afternoon ...   
7750   pchicken tender good one favorites mom makes o...   

                                         text_lemmatized  \
544                 pa favorite hubby love flavor one ip   
14324     pany idea nonalcoholic substitute beer recipep   
616                                            pawesomep   
6643   pabsolutely delcious easy make 

In [37]:
df.head()

Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text,text_cleaned,text_lemmatized,tokens,word2vec_features
544,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_108956,u_1oKVZzv4THPuCQ2sIogt3sy5uFY,knofflerrocks,1,1613035336,0,0,0,5,100,<p>A favorite of my hubby. He loves the flavor...,pa favorite hubby loves flavor one ip,pa favorite hubby love flavor one ip,"[pa, favorite, hubby, love, flavor, one, ip]","[-0.23809229, 0.2218115, 0.1370357, -0.1680772..."
14324,68,74724,Chocolate Guinness Cake,sp_aUSaElGf_74724_c_115985,u_1oKVdMwdrvRjIXNElUcpvmOBQSJ,kathydew,1,1613035368,0,0,0,4,100,<p>Any ideas for non-alcoholic substitute for ...,pany ideas nonalcoholic substitute beer recipep,pany idea nonalcoholic substitute beer recipep,"[pany, idea, nonalcoholic, substitute, beer, r...","[-0.020233285, 0.22364388, 0.06768603, -0.0038..."
616,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_117943,u_1oKVggQUdGID7oRqTsDfStoLlTg,rockstaralice,1,1613035422,0,0,0,5,100,<p>Awesome!!!!!!!!!!!!!!!!!!!!!!</p>,pawesomep,pawesomep,[pawesomep],"[0.0035642313, -0.0027186703, -0.008843787, -0..."
6643,19,1693,Moist Chocolate Cake,sp_aUSaElGf_1693_c_111069,u_1oKVmxKa8OSdxKfiCCutej4RTXe,Punkyjoe81,1,1613035477,0,0,0,5,100,<p>ABSOLUTELY DELCIOUS! And so easy to make! ...,pabsolutely delcious easy make made afternoon ...,pabsolutely delcious easy make made afternoon ...,"[pabsolutely, delcious, easy, make, made, afte...","[-0.25375727, 0.32416165, -0.100088604, -0.011..."
7750,25,23222,Baked Mushroom Chicken,sp_aUSaElGf_23222_c_122000,u_1oKVmrpQvTsv8GyS1JiA39K9QuI,ryanswife,1,1613035506,0,0,0,4,100,<p>Chicken is tender and very good. One of my ...,pchicken tender good one favorites mom makes o...,pchicken tender good one favorite mom make oft...,"[pchicken, tender, good, one, favorite, mom, m...","[-0.31730932, 0.16005065, 0.21886005, -0.28350..."


In [38]:
# Step 7: Drop rows where 'text_lemmatized' is empty
df = df[df['text_lemmatized'].str.strip().astype(bool)]

# Final check after cleaning
print("\nFinal Dataset Shape:", df.shape)
print("Preview of Cleaned Data:")
print(df.head())



Final Dataset Shape: (17863, 18)
Preview of Cleaned Data:
       recipe_number  recipe_code              recipe_name  \
544                1        14299       Creamy White Chili   
14324             68        74724  Chocolate Guinness Cake   
616                1        14299       Creamy White Chili   
6643              19         1693     Moist Chocolate Cake   
7750              25        23222   Baked Mushroom Chicken   

                       comment_id                        user_id  \
544    sp_aUSaElGf_14299_c_108956  u_1oKVZzv4THPuCQ2sIogt3sy5uFY   
14324  sp_aUSaElGf_74724_c_115985  u_1oKVdMwdrvRjIXNElUcpvmOBQSJ   
616    sp_aUSaElGf_14299_c_117943  u_1oKVggQUdGID7oRqTsDfStoLlTg   
6643    sp_aUSaElGf_1693_c_111069  u_1oKVmxKa8OSdxKfiCCutej4RTXe   
7750   sp_aUSaElGf_23222_c_122000  u_1oKVmrpQvTsv8GyS1JiA39K9QuI   

           user_name  user_reputation  created_at  reply_count  thumbs_up  \
544    knofflerrocks                1  1613035336            0          0   
14324