In [None]:
import pandas as pd

# Read the raw student scores CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="/content/student_scores.csv")

# Keep only the first row seen for each Name. The explicit .copy() makes
# df_clean an independent frame, so the assignments below modify it directly.
df_clean = df.drop_duplicates(subset="Name", keep="first").copy()

# Replace the missing marks of every subject with that subject's median
for subject in ("Math", "Physics", "Chemistry"):
    subject_median = df_clean[subject].median()
    df_clean.loc[:, subject] = df_clean[subject].fillna(subject_median)

# Number the students 1..N following the current row order
student_count = len(df_clean)
df_clean.loc[:, "StudentID"] = range(1, student_count + 1)

# Row-wise sum of the three subject marks, then the per-student average
subject_marks = df_clean[["Math", "Physics", "Chemistry"]]
df_clean.loc[:, "TotalMarks"] = subject_marks.sum(axis=1)
df_clean.loc[:, "AverageMarks"] = df_clean["TotalMarks"] / 3

# Min-max normalization: rescale each subject's marks to the [0, 1] range and
# store the result in a "<Subject>_Norm" column.
for subject in ("Math", "Physics", "Chemistry"):
    lowest = df_clean[subject].min()
    spread = df_clean[subject].max() - lowest
    if spread == 0:
        # Every student has the same mark, so (x - min) / (max - min) would be
        # 0 / 0 and fill the column with NaN; use 0.0 for a constant column.
        df_clean.loc[:, subject + "_Norm"] = 0.0
    else:
        df_clean.loc[:, subject + "_Norm"] = (df_clean[subject] - lowest) / spread

# Mean mark per subject for each Name (one output row per Name)
pivot_table = pd.pivot_table(
    df_clean,
    index="Name",
    values=["Math", "Physics", "Chemistry"],
    aggfunc="mean",
)

print(pivot_table)


         Chemistry  Math  Physics
Name                             
Alice         88.0  78.0     85.0
Bob           88.0  82.0     84.5
Charlie       91.0  75.0     79.0
David         87.0  80.0     92.0
Eve           88.0  85.0     84.0


In [None]:
import pandas as pd

# ---------------------------------------------------
# 1. Load Dataset
# ---------------------------------------------------

# Read the raw movie reviews and echo them before any cleaning is applied
df = pd.read_csv(filepath_or_buffer="/content/movie_reviews.csv")

print("Original Data:", df, sep="\n")

# ---------------------------------------------------
# 2. Cleaning and Preprocessing
# ---------------------------------------------------

# Drop repeated reviews (same UserID and MovieTitle), keeping the first one.
# The explicit .copy() makes df_cleaned an independent DataFrame, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_cleaned = df.drop_duplicates(subset=["UserID", "MovieTitle"], keep="first").copy()

# Fill each missing rating with the mean rating of the same movie; ratings that
# are already present are left untouched. A movie whose ratings are all missing
# has no mean to fall back on and therefore stays NaN.
movie_mean_rating = df_cleaned.groupby("MovieTitle")["Rating"].transform("mean")
df_cleaned["Rating"] = df_cleaned["Rating"].fillna(movie_mean_rating)

# Strip surrounding whitespace and lower-case the review text so that the later
# text searches do not depend on casing or padding
df_cleaned["ReviewText"] = (
    df_cleaned["ReviewText"]
    .str.strip()
    .str.lower()
)

print("\nCleaned Data:")
print(df_cleaned)

# ---------------------------------------------------
# 3. Aggregation
# ---------------------------------------------------

# Per-movie summary of the ratings: mean, median and count
ratings_by_movie = df_cleaned.groupby("MovieTitle")["Rating"]
movie_stats = ratings_by_movie.agg(["mean", "median", "count"])

print("\nMovie Rating Statistics:", movie_stats, sep="\n")

# The top-rated movie is the title with the highest mean rating
mean_ratings = movie_stats["mean"]
top_movie = mean_ratings.idxmax()
print("\nTop Rated Movie:", top_movie)

# ---------------------------------------------------
# 4. Text Analysis
# ---------------------------------------------------

review_text = df_cleaned["ReviewText"]

# Total number of "amazing" matches summed over all reviews
amazing_count = review_text.str.count("amazing").sum()

print("\nTotal occurrences of 'amazing':", amazing_count)

# Keep the reviews rated above 4.0 whose text contains "plot"
is_high_rating = df_cleaned["Rating"] > 4.0
mentions_plot = review_text.str.contains("plot")
high_rating_plot_reviews = df_cleaned[is_high_rating & mentions_plot]

print("\nReviews with rating > 4.0 containing 'plot':", high_rating_plot_reviews, sep="\n")


Original Data:
  UserID    MovieTitle  Rating               ReviewText
0   U001     Inception     4.5           Amazing movie!
1   U002     Inception     4.0               brilliant 
2   U001     Inception     4.5           Amazing movie!
3   U003  Interstellar     5.0            Mind-blowing!
4   U004  Interstellar     NaN     Outstanding visuals.
5   U005     Inception     4.2  Good plot but confusing

Cleaned Data:
  UserID    MovieTitle  Rating               ReviewText
0   U001     Inception     4.5           amazing movie!
1   U002     Inception     4.0                brilliant
3   U003  Interstellar     5.0            mind-blowing!
4   U004  Interstellar     5.0     outstanding visuals.
5   U005     Inception     4.2  good plot but confusing

Movie Rating Statistics:
                  mean  median  count
MovieTitle                           
Inception     4.233333     4.2      3
Interstellar  5.000000     5.0      2

Top Rated Movie: Interstellar

Total occurrences of 'amazing': 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["Rating"] = df_cleaned.groupby("MovieTitle")["Rating"].transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["ReviewText"] = (
