In [21]:
!pip install -q streamlit requests pandas trafilatura cloudscraper seaborn pyngrok

In [24]:
%%writefile app.py
import streamlit as st
import pandas as pd
import requests
import time
import trafilatura
import cloudscraper
import re
import matplotlib.pyplot as plt
import seaborn as sns
import math
from datetime import datetime, timedelta
from pathlib import Path

# --- 1. Load Colab Secrets ---
def get_colab_secrets():
    """Return the Adzuna ``(app_id, app_key)`` pair stored in Colab secrets.

    The lookup is best-effort: when ``google.colab`` cannot be imported or the
    secrets cannot be read, the function falls back to two empty strings so the
    user can type the credentials into the sidebar instead.

    Returns:
        tuple[str, str]: ``(app_id, app_key)``; each element is ``""`` when the
        corresponding secret is unavailable or empty.
    """
    try:
        from google.colab import userdata
        id_val = userdata.get('ADZUNA_APP_ID')
        key_val = userdata.get('ADZUNA_APP_KEY')
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return "", ""
    # Normalise missing values to "" so callers always receive strings.
    return id_val or "", key_val or ""

# Pre-fill the credential inputs from Colab secrets when they are available.
secret_id, secret_key = get_colab_secrets()

# --- 2. Configuration & UI Setup ---
st.set_page_config(page_title="Job Skill Analyzer", layout="wide")
st.title("🎯 Job Skill Analyzer: Data Science & Analytics")
st.markdown("Scan and analyze real-time job market requirements via Adzuna API.")

# --- 3. Sidebar Setup ---
with st.sidebar:
    st.header("🔑 API Credentials")
    # Strip stray whitespace that tends to sneak in when keys are pasted.
    app_id = st.text_input("Adzuna APP ID", value=secret_id or "").strip()
    app_key = st.text_input("Adzuna APP KEY", type="password", value=secret_key or "").strip()

    st.header("🔍 Search Filters")
    # The Adzuna endpoint identifies the United Kingdom as "gb"; the familiar
    # "uk" label is kept for display only.
    country = st.selectbox(
        "Country",
        ["us", "gb", "ca", "au"],
        index=0,
        format_func=lambda code: "uk" if code == "gb" else code,
    )
    city = st.text_input("City", value="Baltimore")
    keywords_input = st.text_input("Keywords (Comma separated)", "data scientist, data analyst")
    # Drop blank entries (e.g. from a trailing comma) so no empty search is issued.
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

    st.header("⚙️ Settings")
    # Already inside the sidebar context, so plain st.* widgets land in the sidebar.
    days_back = st.slider("Days Back", 7, 90, 60)
    max_pages = st.number_input("Max Pages", 1, 10, 5) # Default to 5 to match original script

# --- 4. Synchronized Title Filtering Rules ---
# Lower-case phrases that is_relevant_title() looks for as substrings of the
# lower-cased job title. A title is kept when it contains at least one target
# phrase and none of the excluded ones.
TARGET_TITLE_KEYWORDS = [
    "data scientist",
    "senior data scientist",
    "sr. data scientist",
    "lead data scientist",
    "principal data scientist",
    "staff data scientist",
    "data analyst",
    "senior data analyst",
]
EXCLUDED_TITLE_KEYWORDS = [
    "architect",
    "engineer",
    "manager",
    "consultant",
    "director",
    "vp",
    "vice president",
    "head of",
    "marketing",
    "sales",
    "product",
]

# --- 5. Synchronized Skill Taxonomy ---
# Category -> keywords. find_skills_refined() strips each keyword and searches
# for it case-insensitively as a standalone token, so the padding around " R "
# is removed before matching.
# NOTE(review): "statisti" looks like a stem, but because keywords are matched
# as standalone tokens it cannot match "statistical"/"statistics" - confirm
# whether prefix matching was intended.
SKILL_KEYWORDS = {
    "Programming": [
        "Python", "SQL", " R ", "SAS", "Stata", "Julia",
        "C++", "Java", "Scala", "Go", "Bash", "Shell",
    ],
    "Cloud & Big Data": [
        "AWS", "Azure", "GCP", "Google Cloud", "Snowflake",
        "Databricks", "Spark", "Hadoop", "Kafka", "Redshift",
        "BigQuery", "Athena", "Glue", "Terraform", "Airflow",
    ],
    "Databases": [
        "PostgreSQL", "MySQL", "MongoDB", "NoSQL",
        "SQL Server", "Oracle", "Cassandra",
    ],
    "ML & AI": [
        "Machine Learning", "Deep Learning", "Reinforcement Learning",
        "NLP", "Natural Language Processing", "Computer Vision",
        "Generative AI", "GenAI", "LLM", "GPT",
        "Scikit-learn", "TensorFlow", "PyTorch", "Keras",
        "XGBoost", "LightGBM", "CatBoost",
    ],
    "Stats & Research": [
        "Statistics", "statisti", "Biostatistics", "Causal Inference",
        "Epidemiology", "Econometrics", "Bayesian", "Survival Analysis",
        "Longitudinal", "Time Series", "A/B Testing", "Experimental Design",
        "Propensity Score", "Clinical Trials", "Regression", "Hypothesis Testing",
        "RCT", "GIS", "Geographic Information Systems", "Spatial",
    ],
    "Visualization & BI": [
        "Tableau", "Power BI", "Looker", "Qlik", "Matplotlib",
        "Seaborn", "Plotly", "Shiny", "D3.js",
    ],
    "Engineering & DevOps": [
        "Git", "GitHub", "CI/CD", "Docker", "Kubernetes",
        "MLOps", "Agile", "Scrum", "DevOps",
    ],
    "Degree": [
        "PhD", "Ph.D.", "Doctorate", "Master", "M.S.",
        "MSc", "MPH", "MBA", "Bachelor",
    ],
}

def is_relevant_title(title, target_keywords=None, excluded_keywords=None):
    """Decide whether a job title belongs in the analysis.

    A title is relevant when it contains at least one target phrase and none of
    the excluded phrases. Matching is a case-insensitive substring test.

    Args:
        title: Job title to check. Empty or non-string values (``None``, NaN)
            are never relevant.
        target_keywords: Phrases that qualify a title. Defaults to the
            module-level ``TARGET_TITLE_KEYWORDS``.
        excluded_keywords: Phrases that disqualify a title. Defaults to the
            module-level ``EXCLUDED_TITLE_KEYWORDS``.

    Returns:
        bool: ``True`` when the title should be kept.
    """
    # Guard against missing values and non-text payloads before calling .lower().
    if not isinstance(title, str) or not title:
        return False
    if target_keywords is None:
        target_keywords = TARGET_TITLE_KEYWORDS
    if excluded_keywords is None:
        excluded_keywords = EXCLUDED_TITLE_KEYWORDS

    lowered = title.lower()
    # An excluded phrase always wins, so check it first and stop early.
    if any(k.lower() in lowered for k in excluded_keywords):
        return False
    return any(k.lower() in lowered for k in target_keywords)

def find_skills_refined(text, taxonomy=None):
    """Return the distinct skills mentioned in a job description.

    Every keyword of the taxonomy is stripped and searched for
    case-insensitively as a standalone token, i.e. not directly preceded or
    followed by a word character. Lookarounds are used instead of ``\\b``
    because ``\\b`` cannot match after a keyword that ends in punctuation
    ("C++", "Ph.D.", "M.S.") unless a word character follows immediately.

    Args:
        text: Description to scan. Missing or non-string values yield ``[]``.
        taxonomy: Mapping of category -> list of keywords. Defaults to the
            module-level ``SKILL_KEYWORDS``.

    Returns:
        list[str]: Matched skills without duplicates, in taxonomy order.
        "Ph.D." is reported as "PhD" and "M.S." as "Master".
    """
    if not isinstance(text, str) or not text:
        return []
    if taxonomy is None:
        taxonomy = SKILL_KEYWORDS

    # Spelling variants that are reported under a single canonical name.
    aliases = {'Ph.D.': 'PhD', 'M.S.': 'Master'}

    found = {}  # dict keys keep first-seen order, giving a deterministic result
    for keywords in taxonomy.values():
        for raw in keywords:
            keyword = raw.strip()
            if not keyword:
                continue
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            if re.search(pattern, text, re.I):
                found[aliases.get(keyword, keyword)] = None
    return list(found)

def fetch_full_description(url, max_attempts=2, retry_delay=1.5, min_length=300):
    """Download a job posting and extract its main text (best effort).

    The page is fetched through a cloudscraper session and passed to
    trafilatura. Each attempt is guarded separately, so a transient network
    error on one attempt no longer cancels the remaining retries.

    Args:
        url: Address of the posting. Empty/None returns ``None`` immediately.
        max_attempts: Number of download attempts.
        retry_delay: Seconds to wait between two attempts.
        min_length: Extracted text must be longer than this many characters to
            be accepted; shorter results are treated as failed extractions.

    Returns:
        str | None: The stripped page text, or ``None`` when nothing usable
        could be retrieved.
    """
    if not url:
        return None

    scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True})
    for attempt in range(1, max_attempts + 1):
        try:
            response = scraper.get(url, timeout=20)
            if response.status_code == 200:
                content = trafilatura.extract(response.text, include_tables=True)
                if content and len(content) > min_length:
                    return content.strip()
        except Exception:
            # Deliberately best-effort: any network/parsing failure counts as a
            # failed attempt and the caller falls back to the API snippet.
            pass
        # Only pause when another attempt will actually follow.
        if attempt < max_attempts:
            time.sleep(retry_delay)
    return None

# --- 6. Execution Logic ---
# Runs the whole pipeline when the sidebar button is pressed:
# query Adzuna -> filter by date and title -> fetch full descriptions ->
# extract skills -> render one bar chart per skill category -> offer a CSV.
if st.sidebar.button("🚀 Start Analysis"):
    if not app_id or not app_key:
        st.error("Please provide Adzuna API Credentials.")
    else:
        all_jobs = []
        cutoff_date = datetime.now() - timedelta(days=days_back)
        # The Adzuna endpoint identifies the United Kingdom as "gb", not "uk".
        country_code = "gb" if country == "uk" else country
        base_url = f"https://api.adzuna.com/v1/api/jobs/{country_code}/search"
        # Ignore blank keywords (e.g. from a trailing comma in the input box).
        search_terms = [kw for kw in keywords if kw]

        status_text = st.empty()
        progress_bar = st.progress(0)
        total_steps = len(search_terms) * max_pages
        current_step = 0

        for kw in search_terms:
            for page in range(1, max_pages + 1):
                current_step += 1
                status_text.info(f"Fetching: {kw} (Page {page}/{max_pages})...")
                progress_bar.progress(current_step / total_steps)

                params = {"app_id": app_id, "app_key": app_key, "what": kw, "where": city, "results_per_page": 50, "sort_by": "date"}
                try:
                    # A timeout keeps a stalled connection from freezing the app.
                    r = requests.get(f"{base_url}/{page}", params=params, timeout=30)
                    if r.status_code != 200:
                        st.warning(f"API Request failed: HTTP {r.status_code}")
                        break
                    data = r.json().get("results", [])
                except (requests.RequestException, ValueError) as e:
                    # Show only the exception type: the message of a requests
                    # error can contain the request URL, which carries
                    # app_id/app_key in its query string.
                    st.warning(f"API Request failed: {type(e).__name__}")
                    break
                if not data:
                    break

                for job in data:
                    # Skip a single malformed record instead of abandoning the page.
                    try:
                        created_dt = datetime.fromisoformat(job.get("created").replace("Z", ""))
                    except (AttributeError, TypeError, ValueError):
                        continue
                    if created_dt < cutoff_date:
                        continue

                    title = job.get("title")
                    if not is_relevant_title(title):
                        continue

                    # Fetch full description - CRITICAL FOR SYNCING COUNTS
                    st.write(f"🔍 Analyzing: {title[:40]}...")
                    full_desc = fetch_full_description(job.get("redirect_url"))
                    # Fall back to the API snippet when the posting page is unusable.
                    final_description = full_desc if full_desc else job.get("description", "")

                    all_jobs.append({
                        "Title": title,
                        "Company": (job.get("company") or {}).get("display_name"),
                        "Location": (job.get("location") or {}).get("display_name"),
                        "Created": job.get("created"),
                        "Description": final_description,
                        "URL": job.get("redirect_url")
                    })
                    # Pause between postings to stay polite towards the scraped sites.
                    time.sleep(1)

        if all_jobs:
            # Sync Raw Total: Removed .drop_duplicates() to match script 2
            df = pd.DataFrame(all_jobs)
            df['found_skills_list'] = df['Description'].apply(find_skills_refined)
            df_effective = df[df['found_skills_list'].map(len) > 0].copy()
            n_sample = len(df_effective)

            st.success(f"Analysis Complete: Found {len(df)} relevant jobs (N={n_sample} with skill data).")

            st.subheader("📋 Filtered Job List")
            st.dataframe(df[['Title', 'Company', 'Location', 'Created']].head(20))

            # --- Visualizations ---
            st.subheader("📊 Skill Market Share")

            # Build the skill -> category lookup once; the first category that
            # lists a (normalised) skill wins.
            skill_to_category = {}
            for cat, ks in SKILL_KEYWORDS.items():
                for k in ks:
                    normalised = k.strip().replace('Ph.D.', 'PhD').replace('M.S.', 'Master')
                    skill_to_category.setdefault(normalised, cat)

            all_found = [
                {'Category': skill_to_category[s], 'Skill': s}
                for skills in df_effective['found_skills_list']
                for s in skills
                if s in skill_to_category
            ]

            # Explicit columns keep the frame filterable even when no skill was found.
            stats_df = pd.DataFrame(all_found, columns=['Category', 'Skill'])
            categories = list(SKILL_KEYWORDS.keys())
            cols_num = 2
            rows_num = math.ceil(len(categories) / cols_num)

            # squeeze=False guarantees a 2-D array of axes for any grid size.
            fig, axes = plt.subplots(rows_num, cols_num, figsize=(16, rows_num * 5), squeeze=False)
            axes = axes.flatten()
            palette = sns.color_palette("viridis", len(categories))

            for i, cat in enumerate(categories):
                cat_data = stats_df[stats_df['Category'] == cat]
                ax = axes[i]
                if not cat_data.empty:
                    counts = cat_data['Skill'].value_counts().reset_index()
                    counts.columns = ['Skill', 'Count']
                    sns.barplot(data=counts, x='Count', y='Skill', ax=ax, color=palette[i])
                    ax.set_title(f'Category: {cat}', fontsize=14, fontweight='bold', color='#2c3e50')
                    # Shared x-range so bars are comparable across categories.
                    ax.set_xlim(0, n_sample + 1)
                    for p in ax.patches:
                        w = int(p.get_width())
                        ax.annotate(f'{w} ({(w/n_sample)*100:.1f}%)', (w, p.get_y() + p.get_height()/2),
                                    xytext=(8, 0), textcoords='offset points', fontweight='bold')
                else:
                    ax.text(0.5, 0.5, 'No matches', ha='center', va='center', color='gray')
                    ax.set_title(f'Category: {cat} (Empty)', fontsize=14, color='gray')

            # Remove unused grid cells when the category count is odd.
            for unused_ax in axes[len(categories):]:
                fig.delaxes(unused_ax)

            # Synchronized Suptitle and Logic
            fig.suptitle(f'Market Analysis: Data Scientist & Data Analyst in {city}\n(Effective N = {n_sample} Jobs with Skills | Raw Total = {len(df)})',
                         fontsize=22, fontweight='bold', y=0.98, color='#1a2a6c')

            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            st.pyplot(fig)
            # Release the figure so repeated runs do not accumulate open figures.
            plt.close(fig)

            # NOTE(review): pressing a Streamlit button normally reruns the script,
            # so the results above may disappear after the download click - confirm
            # and consider keeping `df` in st.session_state.
            st.download_button("📥 Download Results (CSV)", data=df.to_csv(index=False).encode('utf-8-sig'), file_name="jobs_market_analysis.csv", mime="text/csv")
        else:
            st.warning("No jobs matched your filters. Try increasing 'Days Back' or pages.")

Overwriting app.py


In [27]:
# 2. Ë®≠ÂÆö Token (ÊääÂºïËôüÂÖßÊèõÊàê‰Ω†ÁöÑ)
from pyngrok import ngrok
ngrok.set_auth_token("2uJIzWRcCi2AQTsnJHBejqplPK6_5y9YJrZUm2MafCA2Nffym")

# Start Streamlit server on a specific port
!nohup streamlit run app.py --server.port 5011 &

# Start ngrok tunnel to expose the Streamlit server
ngrok_tunnel = ngrok.connect(addr='5011', proto='http', bind_tls=True)

# Print the URL of the ngrok tunnel
print(' * Tunnel URL:', ngrok_tunnel.public_url)

nohup: appending output to 'nohup.out'
 * Tunnel URL: https://39607e84ff94.ngrok-free.app
