In [1]:
import pandas as pd
path = "tripadvisor_european_restaurants2 (1).csv"  

# 2) Load a small sample first (checks that the path is correct)
df_sample = pd.read_csv(path, nrows=5)
print("Columns:", list(df_sample.columns))

# 3) Load full data and display info
df = pd.read_csv(path)
print("\nShape (rows, columns):", df.shape)
print("\nData types:")
print(df.dtypes)

print("\nMissing values per column:")
print(df.isna().sum())


Columns: ['restaurant_name', 'country', 'region', 'city', 'address', 'cuisines', 'avg_rating', 'excellent', 'very_good', 'average', 'poor', 'terrible', 'food', 'service']

Shape (rows, columns): (1048575, 14)

Data types:
restaurant_name     object
country             object
region              object
city                object
address             object
cuisines            object
avg_rating         float64
excellent          float64
very_good          float64
average            float64
poor               float64
terrible           float64
food               float64
service            float64
dtype: object

Missing values per column:
restaurant_name         1
country                 0
region              47505
city               393302
address                 0
cuisines           162735
avg_rating          92200
excellent           90808
very_good           90808
average             90808
poor                90808
terrible            90808
food               463457
service            4

In [2]:
path = "tripadvisor_european_restaurants2 (1).csv"
df = pd.read_csv(path)

# 1) Remove any row that has at least one missing value
df_clean = df.dropna()
# df_clean = df.dropna(subset=["city", "rating"])

print("Original rows:", len(df))
print("After dropna:", len(df_clean))

# 3) Save cleaned dataset
df_clean.to_csv("tripadvisor_clean.csv", index=False)


Original rows: 1048575
After dropna: 328640


In [3]:
df = pd.read_csv("tripadvisor_european_restaurants2 (1).csv")

# Key columns that must not be null
key_cols = ['restaurant_name', 'city', 'country', 'avg_rating']

# Remove rows missing ANY of these key columns
df_clean = df.dropna(subset=key_cols)

print("Original rows:", len(df))
print("After targeted dropna:", len(df_clean))

# Optional: Fill other columns with sensible defaults
df_clean['address'] = df_clean['address'].fillna('Not specified')
df_clean['cuisines'] = df_clean['cuisines'].fillna('Not specified')
df_clean['food'] = df_clean['food'].fillna(0)
df_clean['service'] = df_clean['service'].fillna(0)

# Save cleaned file
df_clean.to_csv("tripadvisor_clean.csv", index=False)
print("\nCleaned file saved: tripadvisor_clean.csv")
print("\nDataset info:")
print(df_clean.info())


Original rows: 1048575
After targeted dropna: 598442


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['address'] = df_clean['address'].fillna('Not specified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['cuisines'] = df_clean['cuisines'].fillna('Not specified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['food'] = df_clean['food'].fillna(0)
A value is trying to be s


Cleaned file saved: tripadvisor_clean.csv

Dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 598442 entries, 0 to 1048574
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   restaurant_name  598442 non-null  object 
 1   country          598442 non-null  object 
 2   region           557328 non-null  object 
 3   city             598442 non-null  object 
 4   address          598442 non-null  object 
 5   cuisines         598442 non-null  object 
 6   avg_rating       598442 non-null  float64
 7   excellent        598442 non-null  float64
 8   very_good        598442 non-null  float64
 9   average          598442 non-null  float64
 10  poor             598442 non-null  float64
 11  terrible         598442 non-null  float64
 12  food             598442 non-null  float64
 13  service          598442 non-null  float64
dtypes: float64(8), object(6)
memory usage: 68.5+ MB
None


In [4]:
df = pd.read_csv("tripadvisor_clean.csv")

# 2) Check duplicates
print("=== DUPLICATE CHECK ===")
print("Total rows:", len(df))
print("Exact duplicate rows:", df.duplicated().sum())

=== DUPLICATE CHECK ===
Total rows: 598442
Exact duplicate rows: 0


In [5]:
# Load the file and check total rows
df = pd.read_csv("tripadvisor_clean.csv")
print("Total rows in tripadvisor_clean.csv:", len(df))

# Or more detailed info
print("\nDataset shape (rows, columns):", df.shape)
print("Column names:", list(df.columns))


Total rows in tripadvisor_clean.csv: 598442

Dataset shape (rows, columns): (598442, 14)
Column names: ['restaurant_name', 'country', 'region', 'city', 'address', 'cuisines', 'avg_rating', 'excellent', 'very_good', 'average', 'poor', 'terrible', 'food', 'service']
