In [5]:
import csv
import datetime as dt
import json
import os
import statistics
import time
from requests.exceptions import RequestException

import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [137]:
# reading required APPIDs
raw_data = pd.read_csv('cortags_ids.csv', index_col= False, header = None)

In [114]:
raw_data = raw_data.rename(columns={0: 'AppID'})

In [9]:
class SteamChartsScraper:
    def __init__(self, dataset_path, output_path, save_interval=100, log_interval=50, timeout=10, check_interval=30):
        """
        Initialize the scraper.
        :param dataset_path: path to the dataset with AppIDs
        :param output_path: path to save results
        :param save_interval: save interval (number of IDs)
        :param log_interval: logging interval (number of rows)
        :param timeout: request timeout (in seconds)
        :param check_interval: connection check interval (in seconds)
        """
        self.dataset_path = dataset_path
        self.output_path = output_path
        self.save_interval = save_interval
        self.log_interval = log_interval
        self.timeout = timeout
        self.check_interval = check_interval
        self.df_input = pd.read_csv(dataset_path)
        self.df_output = None
        self.processed_ids = set()
        self.successful_count_total = 0
        self.total_processed_total = 0
        self.successful_count_session = 0
        self.total_processed_session = 0
        
    def initialize_output(self):
        """Initialize the output dataset."""
        columns = ['AppID', 'Month1', 'Month2', 'AvgPlayers1', 'AvgPlayers2', 'Notes']
        if os.path.exists(self.output_path):
            self.df_output = pd.read_csv(self.output_path)
            self.processed_ids = set(self.df_output['AppID'])
            self.successful_count_total = len(self.df_output[self.df_output['Notes'] == ''])
            self.total_processed_total = len(self.df_output)
        else:
            self.df_output = pd.DataFrame(columns=columns)
            
    def save_progress(self):
        """Save current results to file."""
        self.df_output.to_csv(self.output_path, index=False)
        print(f"Progress saved to {self.output_path}")
        
    def log_progress(self, force_final=False):
        """Log processing progress with overall statistics."""
        if force_final or self.total_processed_session >= self.log_interval:
            total_success_rate = (self.successful_count_total / self.total_processed_total * 100) if self.total_processed_total > 0 else 0
            session_success_rate = (self.successful_count_session / self.total_processed_session * 100) if self.total_processed_session > 0 else 0
            
            print(f"\n=== Processing Progress ===")
            print(f"OVERALL STATISTICS:")
            print(f"  Total processed: {self.total_processed_total} rows")
            print(f"  Successful requests: {self.successful_count_total} ({total_success_rate:.1f}%)")
            print(f"SESSION STATISTICS:")
            print(f"  Processed in session: {self.total_processed_session} rows")
            print(f"  Successful in session: {self.successful_count_session} ({session_success_rate:.1f}%)")
            print(f"REMAINING TO PROCESS: {len(self.df_input) - self.total_processed_total} rows")
            print("=" * 30)
            
            if not force_final:
                self.successful_count_session = 0
                self.total_processed_session = 0
    
    def wait_for_internet_connection(self):
        """Infinitely wait for internet connection to be restored."""
        print("Internet connection loss detected...")
        print("Waiting for connection to be restored...")
        
        while True:
            try:
                # Try to make a simple request to test connection
                test_response = requests.get("https://www.google.com", timeout=10)
                test_response.raise_for_status()
                print("✓ Internet connection restored! Resuming work...")
                return True
            except (RequestException, requests.Timeout):
                print(f"✗ No connection. Next check in {self.check_interval} seconds...")
                time.sleep(self.check_interval)
    
    def test_steamcharts_connection(self):
        """Check SteamCharts availability."""
        try:
            test_response = requests.get("https://steamcharts.com", timeout=self.timeout)
            test_response.raise_for_status()
            return True
        except (RequestException, requests.Timeout):
            return False
    
    def get_game_data(self, app_id):
        """Get game data from SteamCharts with infinite connection waiting."""
        url = f"https://steamcharts.com/app/{app_id}"
        
        while True:
            try:
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find data table
                table = soup.find('table', {'class': 'common-table'})
                if not table:
                    return None, None, None, None, "NoData"
                    
                rows = table.find_all('tr')[1:]  # Skip header row
                if not rows:
                    return None, None, None, None, "EmptyTable"
                    
                # Take last two months (first two months after release)
                rows = rows[-2:] if len(rows) >= 2 else rows
                
                month1, month2 = None, None
                avg_players1, avg_players2 = None, None
                
                # Process rows in correct order (from new to old)
                if len(rows) >= 1:
                    cells = rows[-1].find_all('td')  # Take oldest record (first month)
                    if len(cells) >= 2:
                        month1 = cells[0].text.strip()
                        try:
                            players_text = cells[1].text.strip().replace(',', '')
                            avg_players1 = float(players_text) if players_text else None
                        except ValueError:
                            avg_players1 = None
                            
                if len(rows) >= 2:
                    cells = rows[-2].find_all('td')  # Take second oldest record (second month)
                    if len(cells) >= 2:
                        month2 = cells[0].text.strip()
                        try:
                            players_text = cells[1].text.strip().replace(',', '')
                            avg_players2 = float(players_text) if players_text else None
                        except ValueError:
                            avg_players2 = None
                            
                return month1, month2, avg_players1, avg_players2, ""
                
            except requests.Timeout:
                # For timeouts, don't wait infinitely, just return error
                return None, None, None, None, "Timeout"
                
            except requests.HTTPError as e:
                # HTTP errors (404, 500, etc.) are not connection-related
                if e.response.status_code == 404:
                    return None, None, None, None, "NotFound"
                else:
                    return None, None, None, None, f"HTTPError: {e.response.status_code}"
                    
            except RequestException:
                # NetworkError - wait for connection to be restored
                self.wait_for_internet_connection()
                # After connection restored, continue while True loop
                continue
                
            except Exception as e:
                return None, None, None, None, f"Error: {str(e)[:20]}"
            
    def process_dataset(self):
        """Main method for processing the dataset."""
        self.initialize_output()
        
        print(f"Starting processing. Total records in dataset: {len(self.df_input)}")
        print(f"Already processed: {self.total_processed_total}")
        
        # Initial connection check
        if not self.test_steamcharts_connection():
            print("SteamCharts unavailable. Checking internet connection...")
            self.wait_for_internet_connection()
        
        for index, row in self.df_input.iterrows():
            app_id = row['AppID']
            
            # Skip already processed IDs
            if app_id in self.processed_ids:
                continue
                
            month1, month2, avg_players1, avg_players2, notes = self.get_game_data(app_id)
            
            # Add results to dataset
            new_row = {
                'AppID': app_id,
                'Month1': month1,
                'Month2': month2,
                'AvgPlayers1': avg_players1,
                'AvgPlayers2': avg_players2,
                'Notes': notes
            }
            self.df_output = pd.concat([self.df_output, pd.DataFrame([new_row])], ignore_index=True)
            
            # Update counters
            self.total_processed_total += 1
            self.total_processed_session += 1
            
            if notes == "":
                self.successful_count_total += 1
                self.successful_count_session += 1
                
            # Logging every m rows
            if self.total_processed_session % self.log_interval == 0:
                self.log_progress()
                
            # Saving every n rows
            if self.total_processed_total % self.save_interval == 0:
                self.save_progress()
                
            # Delay to avoid server overload
            time.sleep(0.3)
            
        # Final save and logging
        self.save_progress()
        self.log_progress(force_final=True)
        print("\nProcessing completed!")
        
    def run(self):
        """Start processing."""
        try:
            self.process_dataset()
        except KeyboardInterrupt:
            print("\nProcess interrupted by user. Saving progress...")
            self.save_progress()
            self.log_progress(force_final=True)
            print("Progress saved. Program terminated.")

In [11]:
scraper = SteamChartsScraper(
    dataset_path='IDS.csv',
    output_path='steamcharts_ccu1_results.csv', 
    save_interval=500,
    log_interval=1000,               
    timeout=15                      
)

In [13]:
scraper.run()

Starting processing. Total records in dataset: 32370
Already processed: 32370
Progress saved to steamcharts_ccu1_results.csv

=== Processing Progress ===
OVERALL STATISTICS:
  Total processed: 32370 rows
  Successful requests: 0 (0.0%)
SESSION STATISTICS:
  Processed in session: 0 rows
  Successful in session: 0 (0.0%)
REMAINING TO PROCESS: 0 rows

Processing completed!
