
# College Event Feedback Analysis – Notebook

**Source file:** `student_feedback.csv`

This notebook loads the survey data, cleans it, computes rating summaries, performs a lightweight lexicon-based sentiment analysis on the free-text feedback, and generates plots. It concludes with auto-generated recommendations for organizers.

> Note: The sentiment model is a compact rule-based approach (offline-friendly) and is intended for quick diagnostics. For production/academic use, consider VADER/TextBlob with proper lexicons.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the survey export; every column label is coerced to str and stripped
# of surrounding whitespace in the same step.
df = pd.read_csv(r"/mnt/data/student_feedback.csv").rename(
    columns=lambda label: str(label).strip()
)
df.head()


In [None]:

def detect_text_col(df):
    """Guess which column holds the free-text feedback.

    Non-numeric columns whose header contains a feedback-like keyword are
    preferred. If none qualify, every object/string column is considered.
    Among the candidates, the column with the longest average (non-missing)
    entry wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses.

    Returns
    -------
    The label of the selected column, or None when no text-like column exists.
    """
    keywords = ["feedback", "comment", "suggestion", "remarks", "review", "what did", "improve"]

    def mean_length(col):
        # Measure real answers only: casting a missing value to str would count
        # it as the 3-character string "nan" and skew sparse columns.
        answers = df[col].dropna().astype(str)
        return float(answers.str.len().mean()) if len(answers) else 0.0

    def longest(cols):
        lens = {c: mean_length(c) for c in cols}
        return max(lens, key=lens.get)

    # A header such as "Feedback rating" matches a keyword but holds scores,
    # not prose, so numeric columns are never treated as the text column.
    candidates = [
        c for c in df.columns
        if any(k in str(c).lower() for k in keywords)
        and not pd.api.types.is_numeric_dtype(df[c])
    ]
    if candidates:
        return longest(candidates)

    # Fallback: any text-like column (object dtype or a dedicated string dtype).
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    if text_cols:
        return longest(text_cols)
    return None

# Columns pandas already parsed as numeric.
numeric_cols = [name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])]

# Columns whose header mentions a rating-like topic.
rating_keywords = ["rating","satisfaction","organis","content","speaker","overall","recommend","experience","session","timing","management","audio","visual","venue","food"]
keyword_candidates = []
for name in df.columns:
    lowered = name.lower()
    if any(word in lowered for word in rating_keywords):
        keyword_candidates.append(name)

# Try to coerce the non-numeric candidates; a column is converted (in `df`
# itself) only when more than 60% of its entries parse as numbers.
for name in keyword_candidates:
    if name in numeric_cols:
        continue
    as_number = pd.to_numeric(df[name], errors='coerce')
    parsed_share = as_number.notna().mean()
    if not (parsed_share > 0.6):
        continue
    df[name] = as_number
    numeric_cols.append(name)

text_col = detect_text_col(df)
numeric_cols, text_col


In [None]:

# Clean on a copy so this cell can be re-run without compounding its effects on `df`.
df_clean = df.copy()
if text_col:
    raw_text = df_clean[text_col]
    # Strip real answers only. Casting missing values to str would turn them
    # into literal 'None' / '<NA>' strings that would later count as feedback,
    # so missing entries are restored to NaN right after the cast.
    stripped = raw_text.astype(str).str.strip().where(raw_text.notna())
    # Empty strings and literal "nan" text are treated as missing too.
    df_clean[text_col] = stripped.replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# NOTE(review): without a respondent ID column, two students who gave identical
# answers are indistinguishable from a double submission and are merged here.
df_clean = df_clean.drop_duplicates()

for c in numeric_cols:
    col = df_clean[c]
    valid = col.dropna()
    # Infer the rating scale from where more than 70% of the answers fall.
    if valid.between(1, 5).mean() > 0.7:
        lo, hi = 1, 5
    elif valid.between(0, 10).mean() > 0.7:
        lo, hi = 0, 10
    else:
        continue  # not recognisably a 1-5 or 0-10 rating column; leave as is
    # Blank out-of-range values. Series.where upcasts integer columns to float
    # when needed, avoiding a `.loc[...] = np.nan` assignment into an integer
    # column, which recent pandas releases flag as an incompatible-dtype set.
    df_clean[c] = col.where(col.between(lo, hi))
df_clean.head()


In [None]:

# Single-word lexicons for the rule-based scorer below.
positive_words = set("""
amazing awesome great good excellent fantastic helpful friendly engaging informative
enjoyed love loved enjoyable outstanding superb wonderful inspiring impressive
smooth organized well-organized punctual fun entertaining valuable insightful
""".split())

# Only single words can be matched, so every entry must be negative on its own.
# The neutral nouns "content" and "time" are deliberately absent: listing them
# made phrases such as "great content" or "had a good time" cancel out to
# neutral.
negative_words = set("""
bad poor boring disappointed disappointing delay delayed late noisy loud
confusing unclear unorganized unorganised disorganized disorganised messy
crowded mismanaged terrible awful worst horrible long wait waiting queue queues
overpriced costly expensive dull irrelevant short limited
rude unhelpful
""".split())

def simple_sentiment(text):
    """Classify a feedback string as "positive", "negative" or "neutral".

    The text is split on whitespace, each token is stripped of surrounding
    punctuation and lower-cased, and the tokens found in ``positive_words``
    and ``negative_words`` are counted. The sign of (positives - negatives)
    decides the label; a tie, including no matches at all, is "neutral".

    Parameters
    ----------
    text : object
        One feedback entry.

    Returns
    -------
    str or float
        The label, or ``np.nan`` when ``text`` is not a string or is blank.

    Notes
    -----
    Negation is not handled: "not good" still counts "good" as positive.
    """
    if not isinstance(text, str) or not text.strip():
        return np.nan
    tokens = [t.strip(".,!?;:()[]{}'\"").lower() for t in text.split()]
    pos = sum(1 for t in tokens if t in positive_words)
    neg = sum(1 for t in tokens if t in negative_words)
    score = pos - neg
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"

# Label every response, but only when a feedback column was detected.
if text_col:
    feedback_text = df_clean[text_col]
    df_clean['Sentiment'] = feedback_text.apply(simple_sentiment)

# Tally the labels; rows without a label (NaN) are excluded from the counts.
if 'Sentiment' in df_clean:
    sentiment_counts = df_clean['Sentiment'].value_counts(dropna=True)
else:
    sentiment_counts = None
sentiment_counts


In [None]:

# One summary row per numeric column, best average first.
# Stays None when no numeric columns were detected.
rating_summary = None
if numeric_cols:
    summary_rows = []
    for metric in numeric_cols:
        scores = df_clean[metric]
        summary_rows.append({
            'Metric': metric,
            'Count': scores.notna().sum(),
            'Mean': scores.mean(),
            'Median': scores.median(),
            'StdDev': scores.std(),
        })
    rating_summary = pd.DataFrame(summary_rows).sort_values('Mean', ascending=False)
rating_summary


In [None]:

import matplotlib.pyplot as plt

if rating_summary is not None and not rating_summary.empty:
    # Bar chart: mean score of every metric, in the summary's order.
    fig, ax = plt.subplots()
    ax.bar(rating_summary['Metric'].astype(str), rating_summary['Mean'])
    ax.set_title('Average Ratings by Metric')
    ax.set_xlabel('Metric')
    ax.set_ylabel('Average Score')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

    # Histograms for the first six metrics of the summary only, to keep the
    # output short.
    for c in rating_summary['Metric'].astype(str).tolist()[:6]:
        if pd.api.types.is_numeric_dtype(df_clean[c]):
            fig, ax = plt.subplots()
            ax.hist(df_clean[c].dropna())
            ax.set_title(f'Distribution of Ratings: {c}')
            ax.set_xlabel('Score')
            ax.set_ylabel('Frequency')
            fig.tight_layout()
            plt.show()

if 'Sentiment' in df_clean:
    counts = df_clean['Sentiment'].value_counts().reindex(['positive','neutral','negative']).fillna(0).astype(int)
    # Drop categories nobody fell into: a zero-width wedge still gets its label
    # and a "0.0%" caption, which overlap the neighbouring slices.
    counts = counts[counts > 0]
    if not counts.empty:
        fig, ax = plt.subplots()
        ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%')
        ax.set_title('Sentiment Distribution (Feedback Text)')
        fig.tight_layout()
        plt.show()


In [None]:

from collections import Counter


def extract_keywords(series, focus_set):
    """Return the five most frequent ``focus_set`` words found in ``series``.

    Each entry is split on whitespace; tokens are stripped of surrounding
    punctuation and lower-cased before being matched. Missing entries are
    treated as empty strings. The result is a list of ``(word, count)`` pairs.
    """
    tokens = []
    for t in series.fillna('').astype(str):
        toks = [tok.strip(".,!?;:()[]{}'\"").lower() for tok in t.split()]
        tokens.extend([tok for tok in toks if tok in focus_set])
    return Counter(tokens).most_common(5)


recommendations = []
if rating_summary is not None and not rating_summary.empty:
    # Flag every metric that trails the best-rated one by more than 0.75 points.
    low_metrics = rating_summary[rating_summary['Mean'] < (rating_summary['Mean'].max() - 0.75)]
    for _, row in low_metrics.iterrows():
        metric = row['Metric']
        mean = row['Mean']
        recommendations.append(f"Improve '{metric}': current average {mean:.2f}. Target quick wins before the next event.")

    overall_like = [m for m in rating_summary['Metric'] if 'overall' in str(m).lower() or 'satisfaction' in str(m).lower()]
    for m in overall_like:
        scores = df_clean[m]
        mean = scores.mean()
        # "Good" means 4 on a 1-5 scale; for a column holding values above 5
        # (a 0-10 scale) the equivalent bar is 8, matching the proxy-sentiment
        # thresholds used in this notebook.
        threshold = 8 if scores.max() > 5 else 4
        # Test for missing explicitly: relying on float truthiness would also
        # skip a legitimate mean of 0.0.
        if pd.notna(mean) and mean < threshold:
            recommendations.append(f"Overall satisfaction for '{m}' is {mean:.2f}. Run a structured post-mortem and prioritize top 3 fixes.")

if 'Sentiment' in df_clean:
    concern_words = set(['timing','late','delay','queue','sound','audio','mic','noise','crowd','food','registration','seating','speaker','content','organization','management','venue','wifi','projector','lighting'])
    highlight_words = set(['speaker','workshop','networking','games','music','prizes','sessions','organization','volunteers','food'])

    # Topic words are counted separately in negative and positive feedback.
    neg_texts = df_clean.loc[df_clean['Sentiment']=='negative', text_col] if text_col else pd.Series(dtype=str)
    pos_texts = df_clean.loc[df_clean['Sentiment']=='positive', text_col] if text_col else pd.Series(dtype=str)

    neg_keys = extract_keywords(neg_texts, concern_words)
    pos_keys = extract_keywords(pos_texts, highlight_words)

    if neg_keys:
        joined = '; '.join([f"{w} (x{c})" for w,c in neg_keys])
        recommendations.append(f"Address frequent concerns: {joined}. Assign owners and deadlines.")

    if pos_keys:
        joined = '; '.join([f"{w} (x{c})" for w,c in pos_keys])
        recommendations.append(f"Double down on what worked: {joined}. Preserve these strengths.")

# Always return at least one actionable item.
if not recommendations:
    recommendations.append("Maintain strengths and run a pre-event pulse survey to identify top 2–3 improvements.")

recommendations



### Proxy Sentiment from Ratings
No free-text feedback was available, so we derived a **proxy sentiment** from the average of all rating metrics per response:
- **Positive**: average ≥ 4.0 (on a 1–5 scale) or ≥ 8.0 (on a 0–10 scale)
- **Neutral**: average ≥ 3.0 (1–5) or ≥ 5.0 (0–10), but below the positive threshold
- **Negative**: below neutral threshold


In [None]:

# Per-response average over all rating columns. It is NaN when a respondent
# left every rating blank (or when there are no rating columns at all).
row_avg = df_clean[numeric_cols].mean(axis=1)

# Share of answers within 1-5, per column. An entirely empty column yields NaN;
# it is skipped so that it cannot turn the overall average into NaN and
# silently select the 0-10 thresholds.
in_range_shares = [df_clean[c].dropna().between(1, 5).mean() for c in numeric_cols]
in_range_shares = [s for s in in_range_shares if pd.notna(s)]
scale_1_5 = bool(in_range_shares) and (sum(in_range_shares) / len(in_range_shares) > 0.5)

if scale_1_5:
    pos_thresh, neu_thresh = 4.0, 3.0
else:
    pos_thresh, neu_thresh = 8.0, 5.0

labels = np.where(row_avg >= pos_thresh, "positive",
                  np.where(row_avg >= neu_thresh, "neutral", "negative"))
# A NaN average fails both comparisons above and would otherwise be labelled
# "negative"; responses without any rating get no label instead.
proxy_sentiment = (
    pd.Series(labels, index=df_clean.index, dtype=object)
    .where(row_avg.notna())
    .to_numpy()
)
df_clean["ProxySentimentFromRatings"] = proxy_sentiment
df_clean["ProxySentimentFromRatings"].value_counts()
