In [None]:
# app.py
import json
import os
import random
import re
import time
from datetime import datetime, timedelta

import pandas as pd
import plotly
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, render_template_string, send_file
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Flask application object; the @app.route decorators in this file register views on it.
app = Flask(__name__)

# Jinja2 template for the single page of the app, rendered with render_template_string.
# Context variables it reads: keyword, error, count, plots (titles/skills/cities/trends)
# and jobs (items exposing title, company, location, source, date_posted).
# NOTE(review): the plots.* values are emitted with `|safe` inside a <script> block, so
# whatever string is passed there reaches the browser unescaped.
HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Real-Time Job Trend Analyzer</title>
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
  <style>
    body { font-family: Arial; max-width: 1000px; margin: 20px auto; }
    form { margin-bottom: 20px; }
    input, button { padding: 8px; font-size: 1rem; }
    .chart { margin-top: 30px; }
    table { width: 100%; border-collapse: collapse; margin-top: 30px; }
    th, td { border: 1px solid #ccc; padding: 6px; text-align: left; }
    th { background: #f4f4f4; }
    .jobs-list { max-height: 300px; overflow: auto; }
  </style>
</head>
<body>
  <h1>Real-Time Job Trend Analyzer</h1>
  <form method="POST" action="/">
    <input name="keyword" placeholder="Keyword (e.g. Data Analyst)" value="{{ keyword }}" required>
    <button type="submit">Scrape & Analyze Now</button>
  </form>

  {% if error %}
    <p style="color:red">{{ error }}</p>
  {% endif %}

  {% if count %}
    <p><strong>Total jobs scraped:</strong> {{ count }}</p>

    <div id="titles" class="chart"></div>
    <div id="skills" class="chart"></div>
    <div id="cities" class="chart"></div>
    <div id="trends" class="chart"></div>

    <script>
      Plotly.newPlot('titles', {{ plots.titles|safe }});
      Plotly.newPlot('skills', {{ plots.skills|safe }});
      Plotly.newPlot('cities', {{ plots.cities|safe }});
      Plotly.newPlot('trends', {{ plots.trends|safe }});
    </script>

    <h2>All Scraped Jobs</h2>
    <div class="jobs-list">
      <table>
        <tr>
          <th>Title</th><th>Company</th><th>Location</th><th>Source</th><th>Date Posted</th>
        </tr>
        {% for job in jobs %}
        <tr>
          <td>{{ job.title }}</td>
          <td>{{ job.company }}</td>
          <td>{{ job.location }}</td>
          <td>{{ job.source }}</td>
          <td>{{ job.date_posted }}</td>
        </tr>
        {% endfor %}
      </table>
    </div>

    <p><a href="/download"><button>Download CSV</button></a></p>
  {% endif %}
</body>
</html>
"""

# HTTP headers installed on the requests session used by scrape_indeed.
INDEED_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}
# (low, high) bounds in seconds, unpacked into random.uniform for the pause between Indeed pages.
DELAY = (1.0, 2.5)

# A day count followed by either "+" ("30+ days ago") or the word day/days ("3 days ago").
_INDEED_AGE_RE = re.compile(r"(\d+)\s*(?:\+|days?\b)")


def parse_indeed_date(date_str):
    """Convert an Indeed relative posting label into a calendar date.

    Recognised labels (case-insensitive, any surrounding text is ignored):
    "Today" / "Just posted" -> today's date; "N day(s) ago" and "N+ days ago"
    -> today minus N days.

    Args:
        date_str: Raw label text scraped from a job card; may be empty or None.

    Returns:
        A ``datetime.date``, or ``None`` when the label is empty or does not
        describe an age in days (for example "Hiring ongoing").
    """
    if not date_str:
        return None
    text = date_str.lower()
    today = datetime.today().date()
    # Match on the untouched label: stripping the word "posted" first would
    # turn "just posted" into "just" and the check could never succeed.
    if "today" in text or "just posted" in text:
        return today
    # Searching for the number (instead of int() on the first token) also
    # handles labels with a glued prefix such as "EmployerActive 3 days ago".
    match = _INDEED_AGE_RE.search(text)
    if match is None:
        return None
    try:
        return today - timedelta(days=int(match.group(1)))
    except OverflowError:
        # An absurdly large day count cannot be represented as a date.
        return None

def _parse_indeed_card(card):
    """Extract one job dict from an Indeed result card, or None if a required field is missing."""
    title_el = card.find("h2", class_="jobTitle")
    company_el = card.find("span", class_="companyName")
    location_el = card.find("div", class_="companyLocation")
    if title_el is None or company_el is None or location_el is None:
        return None
    # The date label is optional; a card without one is kept with an empty date.
    date_el = card.find("span", class_="date")
    date_text = date_el.get_text(strip=True) if date_el else ""
    parsed_date = parse_indeed_date(date_text)
    return {
        "title": title_el.get_text(strip=True),
        "company": company_el.get_text(strip=True),
        "location": location_el.get_text(strip=True),
        "source": "Indeed",
        "date_posted": parsed_date.isoformat() if parsed_date else "",
    }


def scrape_indeed(keyword, max_pages=5):
    """Scrape Indeed search result pages for ``keyword``.

    Pages are fetched in order (10 results per page via the ``start`` query
    parameter). Scraping stops early, keeping what was collected so far, on a
    request failure, a non-200 response, a detected CAPTCHA page, or a page
    with no job cards.

    Args:
        keyword: Search phrase; URL-quoted into the ``q`` query parameter.
        max_pages: Maximum number of result pages to request.

    Returns:
        A list of dicts with keys ``title``, ``company``, ``location``,
        ``source`` (always "Indeed") and ``date_posted`` (ISO date or "").
    """
    jobs = []
    kw = requests.utils.quote(keyword)
    # The context manager closes the session's pooled connections on every exit path.
    with requests.Session() as session:
        session.headers.update(INDEED_HEADERS)
        for page in range(max_pages):
            if page:
                # Pause only between consecutive requests, not after the last one.
                time.sleep(random.uniform(*DELAY))
            url = f"https://www.indeed.com/jobs?q={kw}&start={page*10}"
            try:
                r = session.get(url, timeout=10)
            except requests.RequestException as exc:
                print(f"Indeed request failed: {exc}")
                break
            if r.status_code != 200:
                break
            soup = BeautifulSoup(r.text, "html.parser")
            if "captcha" in soup.text.lower():
                print("CAPTCHA detected on Indeed")
                break
            cards = soup.find_all("div", class_="job_seen_beacon")
            if not cards:
                break
            for card in cards:
                job = _parse_indeed_card(card)
                if job is not None:
                    jobs.append(job)
    return jobs

def scrape_linkedin(keyword, max_scroll=3):
    """Scrape LinkedIn's job search page for ``keyword`` using headless Chrome.

    The page is scrolled to the bottom ``max_scroll`` times (clicking the
    "See more jobs" button when it is present) before the job cards are read.

    Args:
        keyword: Search phrase; URL-quoted into the ``keywords`` query parameter.
        max_scroll: Number of scroll passes to perform before reading cards.

    Returns:
        A list of dicts with keys ``title``, ``company``, ``location``,
        ``source`` (always "LinkedIn") and ``date_posted`` (first 10 characters
        of the card's ``<time datetime>`` attribute, or "" when unavailable).

    Raises:
        Whatever Selenium raises while starting Chrome or loading the page;
        the browser is always shut down before the exception propagates.
    """
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=opts)
    jobs = []
    try:
        driver.get(f"https://www.linkedin.com/jobs/search/?keywords={requests.utils.quote(keyword)}")

        for _ in range(max_scroll):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1.5, 3.0))
            try:
                see_more_button = driver.find_element(By.XPATH, "//button[contains(@aria-label, 'See more jobs')]")
                driver.execute_script("arguments[0].click();", see_more_button)
                time.sleep(1)
            except NoSuchElementException:
                # The button is not always rendered; scrolling alone is enough then.
                pass

        for card in driver.find_elements(By.CSS_SELECTOR, "li.base-card"):
            try:
                title = card.find_element(By.CSS_SELECTOR, "h3.base-search-card__title").text.strip()
                company = card.find_element(By.CSS_SELECTOR, "h4.base-search-card__subtitle").text.strip()
                loc = card.find_element(By.CSS_SELECTOR, "span.job-search-card__location").text.strip()
                # find_elements returns an empty list instead of raising, so a
                # card without a <time> tag is kept with an empty date.
                time_elements = card.find_elements(By.CSS_SELECTOR, "time")
                stamp = time_elements[0].get_attribute("datetime") if time_elements else None
            except WebDriverException:
                # Skip cards missing a required field or gone stale mid-read.
                continue
            jobs.append({
                "title": title,
                "company": company,
                "location": loc,
                "source": "LinkedIn",
                # get_attribute yields None when the attribute is absent.
                "date_posted": (stamp or "")[:10],
            })
    finally:
        # Always shut the browser down, even when loading or scraping fails.
        driver.quit()
    return jobs

def analyze_jobs(df):
    """Compute the summary series shown on the results page.

    Side effect: adds a ``date`` column to ``df`` holding ``date_posted``
    parsed as datetimes (unparseable or empty values become NaT).

    Args:
        df: DataFrame with at least ``title``, ``location`` and
            ``date_posted`` columns; titles must be strings.

    Returns:
        A tuple ``(titles, skills, cities, trends)`` of pandas Series:
        the 5 most common titles, the 10 most common lower-cased words across
        all titles, the 5 most common locations, and postings per calendar
        date sorted by date.
    """
    df["date"] = pd.to_datetime(df["date_posted"], errors="coerce")
    titles = df["title"].value_counts().head(5)
    # Flatten in a single pass; sum(list_of_lists, []) copies the growing
    # accumulator for every title, which is quadratic in the number of words.
    words = [word for title in df["title"] for word in title.lower().split()]
    skills = pd.Series(words).value_counts().head(10)
    cities = df["location"].value_counts().head(5)
    trends = df["date"].dt.date.value_counts().sort_index()
    return titles, skills, cities, trends

def _figure_json(fig):
    """Serialize a Plotly figure to JSON that is safe to inline in a <script> block."""
    payload = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
    # The figure embeds scraped (untrusted) text and the template emits this
    # string unescaped, so a title containing "</script>" could break out of
    # the script block. In JSON these characters can only occur inside string
    # literals, where the \uXXXX form is an equivalent spelling.
    return (
        payload.replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
    )


def _counts_frame(series, label):
    """Turn a value-count Series into a two-column frame named (label, 'count')."""
    frame = series.reset_index()
    frame.columns = [label, 'count']
    return frame


def _build_plots(df):
    """Run the analysis on ``df`` and return the four chart payloads keyed by chart id."""
    t, s, c, tr = analyze_jobs(df)

    fig1 = px.bar(_counts_frame(t, 'title'), x='count', y='title',
                  title="Top 5 Job Titles", labels={'count': 'Number of Jobs', 'title': 'Job Title'})
    fig2 = px.bar(_counts_frame(s, 'skill'), x='count', y='skill',
                  title="Top 10 Skills", labels={'count': 'Frequency', 'skill': 'Skill'})
    fig3 = px.bar(_counts_frame(c, 'city'), x='count', y='city',
                  title="Top 5 Locations", labels={'count': 'Number of Jobs', 'city': 'Location'})
    fig4 = px.line(_counts_frame(tr, 'date'), x='date', y='count',
                   title="Posting Trends", labels={'date': 'Date', 'count': 'Number of Postings'})

    return {
        "titles": _figure_json(fig1),
        "skills": _figure_json(fig2),
        "cities": _figure_json(fig3),
        "trends": _figure_json(fig4),
    }


@app.route("/", methods=["GET","POST"])
def index():
    """Render the search page; on POST, scrape both sites and show the analysis.

    A POST reads the ``keyword`` form field, scrapes Indeed and LinkedIn,
    writes the combined results to ``jobs.csv`` and renders charts plus the
    job table. Any failure while scraping or analysing is logged and shown
    on the page as an error message instead of a server error.
    """
    error = ""
    plots = {}
    jobs = []
    count = 0
    keyword = ""
    if request.method == "POST":
        # .get() lets a request without the field reach the validation message
        # below instead of being rejected with a bare 400 response.
        keyword = request.form.get("keyword", "").strip()
        if not keyword:
            error = "Please enter a keyword."
        else:
            try:
                indeed_jobs = scrape_indeed(keyword)
                linkedin_jobs = scrape_linkedin(keyword)
                jobs = indeed_jobs + linkedin_jobs
                if not jobs:
                    error = "No jobs found."
                else:
                    df = pd.DataFrame(jobs)
                    df.to_csv("jobs.csv", index=False)
                    count = len(df)
                    plots = _build_plots(df)
            except Exception as e:
                # Request boundary: report the problem to the user, but keep
                # the traceback in the server log so it is not lost.
                app.logger.exception("Scrape/analysis failed for keyword %r", keyword)
                error = f"Error occurred during scraping: {str(e)}"
    return render_template_string(HTML, error=error, plots=plots, jobs=jobs, count=count, keyword=keyword)

@app.route("/download")
def download():
    """Send the most recently written ``jobs.csv`` as an attachment, or 404 if none exists."""
    # Resolve the path once so the existence check and send_file refer to the
    # same file regardless of which directory a relative name would be
    # resolved against.
    csv_path = os.path.abspath("jobs.csv")
    if os.path.isfile(csv_path):
        return send_file(csv_path, as_attachment=True)
    return "No CSV found. Please scrape jobs first.", 404

if __name__ == "__main__":
    # host="0.0.0.0" makes the server listen on all network interfaces;
    # debug mode and the auto-reloader are explicitly switched off.
    app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.5.230:5000
Press CTRL+C to quit
