In [None]:
# Basic imports and load 
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the dataset CSV. Set the DATAKIND_DATA_PATH environment variable to
# load the file from somewhere else; when it is unset, the original local path is used.
DATA_PATH = Path(os.environ.get('DATAKIND_DATA_PATH', 'E://datakind_project//datakind_dataset.csv'))
df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell output.
df.head(5)

In [None]:
# Quick info and basic stats (fast)
print('rows,cols:', df.shape)

# A small describe on numeric and object columns separately.
# describe() raises on a frame with no columns, so a dtype group that is
# absent from df is skipped instead of aborting the cell.
for dtype_selection in ([np.number], [object]):
    selected = df.select_dtypes(include=dtype_selection)
    if selected.shape[1]:
        display(selected.describe().transpose())

# Column dtypes (this call was previously fused into the comment line and never ran).
print(df.dtypes)

In [None]:
# Missing / blank-like values summary
# Lower-cased, stripped string values that are counted as "null tokens".
tokens = {'nan', 'none', 'null', 'na', ''}

n = len(df)
missing = df.isnull().sum()
missing_pct = missing.div(n).mul(100).round(6)
obj_cols = list(df.select_dtypes(include='object').columns)

# Number of string cells per object column that are empty after stripping whitespace.
empty_counts = {
    col: sum(1 for value in df[col] if isinstance(value, str) and not value.strip())
    for col in obj_cols
}

# Number of non-null cells per object column whose normalised text is one of `tokens`.
null_token_counts = {
    col: int(
        df[col]
        .dropna()
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(tokens)
        .sum()
    )
    for col in obj_cols
}

# One row per column of df; non-object columns get 0 for the two string-based counts.
summary = pd.DataFrame({
    'dtype': df.dtypes,
    'missing_count': missing,
    'missing_pct': missing_pct,
    'empty_str_count': [empty_counts.get(c, 0) for c in df.columns],
    'null_token_count': [null_token_counts.get(c, 0) for c in df.columns],
}).sort_values('missing_count', ascending=False)
summary.head(40)

In [None]:
# Language distribution for question/answer/tips columns (if present)
# Column labels are compared case-insensitively; 'lang' also covers 'language'.
# str() keeps the check from failing on non-string column labels.
lang_cols = [c for c in df.columns if 'lang' in str(c).lower()]
print('Detected language columns:', lang_cols)
for c in lang_cols:
    print('Top values for', c)
    display(df[c].value_counts(dropna=False).head(20))

# If there's an explicit question language column like 'question_lang' or similar,
# count questions by language. The match uses the same case-insensitive rule as
# lang_cols, so a label such as 'Question_Language' is found as well.
q_lang_candidates = [c for c in lang_cols if 'question' in str(c).lower()]
if q_lang_candidates:
    qlc = q_lang_candidates[0]
    print('Question counts by language (top):')
    display(df[qlc].value_counts().head(20))
elif lang_cols:
    # Fallback: no question-specific column, use the first detected language column.
    display(df[lang_cols[0]].value_counts().head(20))

In [None]:
# Demographics overview (age, gender, country indicators if present)

# Short keywords must match a whole name token: as plain substrings they also hit
# unrelated names ('age' is a substring of 'language', 'message', 'page', ...).
_DEMO_TOKEN_KEYWORDS = {'age', 'sex'}
# Longer keywords are distinctive enough to keep matching as substrings.
_DEMO_SUBSTRING_KEYWORDS = ('gender', 'country', 'region', 'district')


def _is_demographic_column(name):
    """Return True when a column label looks like a demographic attribute.

    The label is compared case-insensitively. Long keywords match anywhere in
    the label; short keywords must be a complete token, where tokens are
    separated by non-alphanumeric characters or a lower-to-upper case change
    (so 'user_age' and 'userAge' match, 'question_language' does not).
    """
    text = str(name)
    lowered = text.lower()
    if any(keyword in lowered for keyword in _DEMO_SUBSTRING_KEYWORDS):
        return True
    # Insert a separator at camelCase boundaries before lower-casing.
    separated = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', text)
    parts = re.split(r'[^a-z0-9]+', separated.lower())
    return any(part in _DEMO_TOKEN_KEYWORDS for part in parts)


demo_cols = [c for c in df.columns if _is_demographic_column(c)]
print('Detected demographic-like columns:', demo_cols)
for c in demo_cols:
    if df[c].dtype == 'object':
        # Categorical-looking column: show the most frequent values, NaN included.
        print('Top categories for', c)
        display(df[c].value_counts(dropna=False).head(20))
    else:
        # Non-object column: show summary statistics instead.
        print('Summary for', c)
        display(df[c].describe())
In [None]:
# Identify power users (by answers given and questions asked).
from pathlib import Path
import csv

# Canonical column names (as provided)
asker_col, answerer_col = 'question_user_id', 'response_user_id'
question_ts_col, response_ts_col = 'question_sent', 'response_sent'
question_text_col, answer_text_col = 'question_content', 'response_content'
language_col = 'question_language'
country_col = 'question_user_country_code'
topic_q_col, topic_r_col = 'question_topic', 'response_topic'

# Columns this cell cannot work without; report and stop if any is absent.
missing = [
    c
    for c in (asker_col, answerer_col, question_text_col, answer_text_col)
    if c not in df.columns
]
if not missing:
    top_n = 20

    # Most frequent values of the asker id column.
    top_askers = df[asker_col].value_counts().head(top_n)
    print(f'Top {top_n} askers (by question count):')
    display(top_askers)

    # Most frequent values of the answerer id column.
    top_answerers = df[answerer_col].value_counts().head(top_n)
    print(f'Top {top_n} answerers (by response count):')
    display(top_answerers)

    # Flatten both rankings into rows (askers first) and save them for manual review.
    rows = [
        {'user_id': str(user), 'role': role, 'count': int(cnt)}
        for role, ranking in (('asker', top_askers), ('answerer', top_answerers))
        for user, cnt in ranking.items()
    ]
    out_path = Path('challenge_1/top_power_users.csv')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['user_id', 'role', 'count'])
        writer.writeheader()
        writer.writerows(rows)
    print('Saved top power users to', out_path)
else:
    print('The following required columns are missing from the dataframe:', missing)
    print('Please provide the correct column names. I will not attempt fuzzy detection when you supplied names.')

In [None]:
# Most talked-about topics overall and by country.
topic_cols = [
    c for c in df.columns
    if any(k in c.lower() for k in ('topic', 'category', 'tag', 'theme'))
]
print('Topic-like columns detected:', topic_cols)

# First column whose name mentions 'country', or None when there is none.
country_cols = [c for c in df.columns if 'country' in c.lower()]
country_col = next(iter(country_cols), None)

if not topic_cols:
    print('No clear topic column detected. Consider running keyword extraction on `question` text to create topics.')
else:
    # Only the first topic-like column is analysed.
    t = topic_cols[0]
    print('Top topics overall:')
    display(df[t].value_counts().head(30))
    if country_col is not None:
        print(f'Top topics by country (sample for top 5 countries) grouping by {country_col}:')
        # The five most frequent country values, each with its ten most frequent topics.
        top_countries = df[country_col].value_counts().head(5).index.tolist()
        for ctry in top_countries:
            print('---', ctry)
            in_country = df[country_col] == ctry
            display(df.loc[in_country, t].value_counts().head(10))

In [None]:
# Parse object date/time columns into datetime (UTC) and create *_dt columns.
# This cell populates dt_cols and sets a preferred date_col for downstream cells.
obj_cols = df.select_dtypes(include=['object']).columns.tolist()

# A column is a candidate when its lower-cased name contains one of these substrings.
date_keywords = ['sent', 'created_at', '_at', 'date', 'dt', 'time']
cand_cols = [c for c in obj_cols if any(k in c.lower() for k in date_keywords)]

dt_cols = []
for col in cand_cols:
    # Unparseable values become NaT (errors='coerce'); utc=True yields UTC timestamps.
    # infer_datetime_format is deliberately not passed: it is deprecated in pandas 2.x
    # (format inference is the default there) and only triggers a warning.
    parsed = pd.to_datetime(df[col], utc=True, errors='coerce')
    parsed_count = int(parsed.notna().sum())
    # Keep the candidate only if at least one value parsed.
    if parsed_count > 0:
        new_col = f"{col.rstrip('_').rstrip('.')}_dt" if not col.endswith('_dt') else col
        # The target column is (re)written on every run; a column already ending in
        # '_dt' is replaced in place by its parsed version.
        df[new_col] = parsed
        dt_cols.append(new_col)
        print(f"Parsed column '{col}' -> '{new_col}': {parsed_count}/{len(df)} non-null (sample values):")
        display(df[new_col].dropna().head(5))

# Deduplicate dt_cols while keeping first-seen order
dt_cols = list(dict.fromkeys(dt_cols))
print("Detected/created datetime columns:", dt_cols)

# Set a preferred date_col for activity plots and other analysis:
# the first preferred name present in df, else the first parsed column, else None.
preferred_candidates = ['question_sent_dt', 'question_dt', 'response_sent_dt', 'response_dt']
date_col = next((c for c in preferred_candidates if c in df.columns), dt_cols[0] if dt_cols else None)
print("Using date column for downstream analysis:", date_col)

# Quick sanity checks for main columns expected by the notebook
# (question_ts_col / response_ts_col are defined in the power-users cell).
for expected in [question_ts_col, response_ts_col, 'question_user_created_at', 'response_user_created_at']:
    if expected in df.columns:
        parsed = pd.to_datetime(df[expected], utc=True, errors='coerce')
        non_null = int(parsed.notna().sum())
        print(f"Sanity parse: '{expected}' -> {non_null}/{len(df)} parsed")

In [None]:
# Show up to 30 rows where the requested columns are all non-null
required_cols = [
    'response_sent',
    'response_sent_dt',
    'question_sent',
    'question_sent_dt',
    'response_user_created_at_dt',
    'response_user_created_at',
    'question_user_created_at_dt',
    # NOTE(review): the original list was cut off after the previous entry; this
    # column is added to complete the raw/parsed pairs -- confirm it is wanted.
    'question_user_created_at',
]

# Work only with the requested columns that exist, and say which ones were skipped,
# so a missing *_dt column does not abort the cell with a KeyError.
present_required_cols = [c for c in required_cols if c in df.columns]
absent_required_cols = [c for c in required_cols if c not in df.columns]
if absent_required_cols:
    print('Requested columns not found in dataframe (skipped):', absent_required_cols)

if present_required_cols:
    # Rows with a value in every present column, limited to those columns.
    display(df.dropna(subset=present_required_cols)[present_required_cols].head(30))
else:
    print('None of the requested columns are present in the dataframe.')

In [None]:
# Frequency over time for question/response sent times and user-created times (monthly)
cols_map = {
    'question_sent_dt': 'Questions (sent)',
    'question_user_created_at_dt': 'Askers (account created)',
    'response_sent_dt': 'Responses (sent)',
    'response_user_created_at_dt': 'Responders (account created)'
}

# Keep only columns that exist in the dataframe
cols_present = {k: v for k, v in cols_map.items() if k in df.columns}
if not cols_present:
    print("No datetime columns found among:", list(cols_map.keys()))
else:
    # Build monthly counts, indexed by the first day of each month
    counts = {}
    for col, label in cols_present.items():
        # Re-parse defensively so the cell also works if the column is not datetime yet;
        # unparseable and missing values are dropped.
        s = pd.to_datetime(df[col], utc=True, errors='coerce').dropna()
        if len(s):
            # The values are UTC-aware; drop the timezone first because to_period()
            # on tz-aware data emits a warning about losing timezone information.
            ser = s.dt.tz_convert(None).dt.to_period('M').value_counts().sort_index()
            ser.index = ser.index.to_timestamp()
            counts[label] = ser
        else:
            # Empty placeholder with a datetime index so the combined index stays datetime.
            counts[label] = pd.Series(dtype='int64', index=pd.DatetimeIndex([]))

    # Combine into a single DataFrame
    counts_df = pd.DataFrame(counts).sort_index()
    if len(counts_df):
        # Add every month between the first and last observed month, so months with
        # no activity in any series appear as 0 instead of being skipped in the plot.
        full_months = pd.date_range(counts_df.index.min(), counts_df.index.max(), freq='MS')
        counts_df = counts_df.reindex(full_months)
    counts_df = counts_df.fillna(0).astype(int)

    # Quick summary: total counts and top months
    print("Total counts (available columns):")
    print(counts_df.sum().to_frame('total_count'))
    print("\nTop 5 months per series:")
    for col in counts_df.columns:
        top = counts_df[col].nlargest(5)
        print(f"\n{col}:")
        print(top)

    # Plot monthly time series (all series on the same axes)
    fig, ax = plt.subplots(figsize=(12, 5))
    for col in counts_df.columns:
        sns.lineplot(x=counts_df.index, y=counts_df[col], label=col, ax=ax)
    ax.set_title('Monthly frequency: questions & responses (sent vs user_created)')
    ax.set_xlabel('Month')
    ax.set_ylabel('Count')
    ax.legend()
    fig.tight_layout()
    plt.show()

    # Optionally show weekly aggregation for a denser trend (uncomment to use)
    # counts_w = {}
    # for col, label in cols_present.items():
    #     s = pd.to_datetime(df[col], utc=True, errors='coerce').dropna()
    #     counts_w[label] = s.dt.to_period('W').value_counts().sort_index()
    # for k in counts_w:
    #     counts_w[k].index = counts_w[k].index.to_timestamp()
    # counts_w_df = pd.DataFrame(counts_w).fillna(0).astype(int).sort_index()
    # display(counts_w_df.head())