# Analisis dataset webusage

In [1]:
import pandas as pd
from datetime import timedelta

# --- Ganti 'path/ke/file_log_anda.csv' dengan lokasi file Anda ---
log_file_path = 'webuage.csv'
# -----------------------------------------------------------------

# Tentukan batas waktu sesi
SESSION_TIMEOUT = timedelta(minutes=30)

# Daftar ekstensi file aset yang akan difilter (dibuang)
ASSET_EXTENSIONS = [
    '.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.ico',
    '.txt', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.zip'
]

### Fungsi untuk load data

In [2]:
def load_data(file_path):
    """
    Memuat data log dari file CSV dan melakukan pengecekan awal.
    """
    print(f"Membaca log dari {file_path}...")

    col_names = [
        'IP', 'Logname', 'User', 'Time', 'Method',
        'URI', 'Protocol', 'Status', 'Size'
    ]

    df = None

    try:
        df = pd.read_csv(
            file_path,
            sep=',', header=None, names=col_names,
            skiprows=1, quotechar='"', on_bad_lines='skip',
            engine='python'
        )
    except FileNotFoundError:
        print(f"Error: File tidak ditemukan di {file_path}")
        return None
    except Exception as e:
        print(f"Error saat membaca file: {e}")
        return None

    if df is None:
        print("Dataframe could not be loaded. Stopping execution.")
        return None

    print("Pratinjau data mentah berhasil dibaca.")
    print("-" * 50)

    print(f"📊 Jumlah baris data mentah (Awal): {len(df)}")

    ip_counts = df['IP'].value_counts()
    print("\n🏆 Top 10 IP dengan akses terbanyak:")
    print(ip_counts.head(10))

    dupe_ips = ip_counts[ip_counts > 1]
    print(f"\n⚠️ Jumlah Unik IP: {len(ip_counts)}")
    print(f"⚠️ Jumlah IP yang 'Duplikat' (akses > 1 kali): {len(dupe_ips)}")
    print("-" * 50)

    print("\n🔍 Mengecek nilai NaN dalam dataset mentah...")
    nan_counts = df.isnull().sum()
    nan_percentages = (df.isnull().sum() / len(df)) * 100
    nan_info = pd.DataFrame({'NaN Count': nan_counts, 'NaN Percentage': nan_percentages})
    print(nan_info[nan_info['NaN Count'] > 0].to_string())
    print("Total NaN values in raw data:", df.isnull().sum().sum())
    print("-" * 50)

    return df

### Fungsi untuk memproses data

In [3]:
def preprocess_data(df, asset_extensions=ASSET_EXTENSIONS):
    """
    Melakukan pembersihan dan filtering data log.
    """
    if df is None:
        return None

    print("Memulai preprocessing (filter GET, 200, dan aset)... ")

    # 1. Filter: Hanya ambil Request Method 'GET'
    df_filtered = df[df['Method'] == 'GET'].copy()

    # 2. Filter: Hanya ambil Status '200' (OK)
    df_filtered['Status'] = pd.to_numeric(df_filtered['Status'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Status'])
    df_filtered = df_filtered[df_filtered['Status'] == 200].copy()

    # 3. Filter: Hanya ambil yang berakhiran .html
    df_clean = df_filtered[df_filtered['URI'].fillna('').str.lower().str.endswith('.html')].copy()

    # Konversi kolom 'Time' ke format datetime
    print("Mengonversi waktu (dengan format ISO8601)...")
    df_clean['Time'] = pd.to_datetime(df_clean['Time'], format='ISO8601')

    print("Data setelah preprocessing (sebelum pengelompokan sesi):")
    print(df_clean.head().to_string())
    print("-" * 50)

    jumlah_awal = len(df)
    jumlah_akhir = len(df_clean)
    jumlah_dibuang = jumlah_awal - jumlah_akhir

    print(f"✅ Jumlah data setelah diproses (df_clean): {jumlah_akhir}")
    print(f"🗑️ Jumlah data 'sampah' (aset/error) yang dibuang: {jumlah_dibuang}")
    print(f"📉 Persentase penyusutan data: {(jumlah_dibuang/jumlah_awal)*100:.2f}%")
    print("-" * 50)

    return df_clean

### Fungsi untuk melakukan sessonize data

In [5]:
def sessionize_data(df_clean, session_timeout=SESSION_TIMEOUT):
    """
    Mengelompokkan data menjadi sesi berdasarkan IP dan batas waktu.
    """
    if df_clean is None:
        return None

    print(f"Mengelompokkan sesi (timeout: {session_timeout})...")

    df_clean = df_clean.sort_values(by=['IP', 'Time'])
    time_diff = df_clean.groupby('IP')['Time'].diff()
    is_new_session = (time_diff.isna()) | (time_diff > session_timeout)
    df_clean['SessionID'] = is_new_session.cumsum()

    print("\n--- PRATINJAU LOG DENGAN SESI ---")
    cols_to_show = ['SessionID', 'IP', 'Time', 'URI', 'Status']
    print(df_clean[cols_to_show].head(15).to_string())
    print("-" * 50)

    try:
        df_clean.to_csv('hasil_log_dengan_sesi.csv', index=False)
        print("\nData log yang sudah dikelompokkan disimpan ke 'hasil_log_dengan_sesi.csv'")
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_log_dengan_sesi.csv': {e}")

    return df_clean

### Fungsi untuk membuat matrix

In [7]:
def create_matrix(df_sessionized):
    """
    Membuat matriks Sesi x Halaman dari data yang sudah disesikan.
    """
    if df_sessionized is None:
        return None

    print("\nMembuat matriks Sesi x Halaman...")

    matrix = pd.crosstab(
        index=[df_sessionized['SessionID'], df_sessionized['IP']],
        columns=df_sessionized['URI']
    )
    matrix_binary = (matrix > 0).astype(int)
    final_output = matrix_binary.reset_index().drop('SessionID', axis=1)

    print("\n--- HASIL AKHIR (MATRIKS) ---")
    print(final_output.to_string())

    try:
        final_output.to_csv('hasil_matriks_sesi.csv', index=False)
        print("\nMatriks hasil juga disimpan ke 'hasil_matriks_sesi.csv'")
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_matriks_sesi.csv': {e}")

    return final_output

### Memulai proses pemanggilan fungsi

In [8]:
df_raw = load_data(log_file_path)
display(df_raw.head())

Membaca log dari webuage.csv...
Pratinjau data mentah berhasil dibaca.
--------------------------------------------------
📊 Jumlah baris data mentah (Awal): 132258

🏆 Top 10 IP dengan akses terbanyak:
IP
66.249.63.228     4
66.249.205.141    3
66.249.250.163    3
66.249.9.101      3
66.249.235.90     3
65.55.153.155     3
66.249.221.28     3
66.249.172.48     3
66.249.89.78      3
134.34.67.219     3
Name: count, dtype: int64

⚠️ Jumlah Unik IP: 129854
⚠️ Jumlah IP yang 'Duplikat' (akses > 1 kali): 2306
--------------------------------------------------

🔍 Mengecek nilai NaN dalam dataset mentah...
Empty DataFrame
Columns: [NaN Count, NaN Percentage]
Index: []
Total NaN values in raw data: 0
--------------------------------------------------


Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
0,65.55.147.227,-,-,2009-10-15T02:00:24Z,GET,/index.html,HTTP/1.1,200,21878
1,65.55.86.34,-,-,2009-10-15T02:00:58Z,GET,/index.html,HTTP/1.1,200,1416
2,148.188.55.88,-,-,2009-10-15T02:01:41Z,GET,/faq.html,HTTP/1.1,200,10946
3,72.30.57.238,-,-,2009-10-15T02:01:59Z,GET,/contribute.txt,HTTP/1.0,200,39943
4,66.249.139.233,-,-,2009-10-15T02:02:09Z,GET,/faq.html,HTTP/1.1,200,17247


In [9]:
df_cleaned = preprocess_data(df_raw)
display(df_cleaned.head())

Memulai preprocessing (filter GET, 200, dan aset)... 
Mengonversi waktu (dengan format ISO8601)...


Data setelah preprocessing (sebelum pengelompokan sesi):
               IP Logname User                      Time Method          URI  Protocol  Status   Size
0   65.55.147.227       -    - 2009-10-15 02:00:24+00:00    GET  /index.html  HTTP/1.1     200  21878
1     65.55.86.34       -    - 2009-10-15 02:00:58+00:00    GET  /index.html  HTTP/1.1     200   1416
2   148.188.55.88       -    - 2009-10-15 02:01:41+00:00    GET    /faq.html  HTTP/1.1     200  10946
4  66.249.139.233       -    - 2009-10-15 02:02:09+00:00    GET    /faq.html  HTTP/1.1     200  17247
5    72.30.50.248       -    - 2009-10-15 02:02:13+00:00    GET  /index.html  HTTP/1.0     200   7883
--------------------------------------------------
✅ Jumlah data setelah diproses (df_clean): 75656
🗑️ Jumlah data 'sampah' (aset/error) yang dibuang: 56602
📉 Persentase penyusutan data: 42.80%
--------------------------------------------------


Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
0,65.55.147.227,-,-,2009-10-15 02:00:24+00:00,GET,/index.html,HTTP/1.1,200,21878
1,65.55.86.34,-,-,2009-10-15 02:00:58+00:00,GET,/index.html,HTTP/1.1,200,1416
2,148.188.55.88,-,-,2009-10-15 02:01:41+00:00,GET,/faq.html,HTTP/1.1,200,10946
4,66.249.139.233,-,-,2009-10-15 02:02:09+00:00,GET,/faq.html,HTTP/1.1,200,17247
5,72.30.50.248,-,-,2009-10-15 02:02:13+00:00,GET,/index.html,HTTP/1.0,200,7883


In [10]:
df_session = sessionize_data(df_cleaned, SESSION_TIMEOUT)
display(df_session.head())

Mengelompokkan sesi (timeout: 0:30:00)...

--- PRATINJAU LOG DENGAN SESI ---
       SessionID               IP                      Time          URI  Status
64146          1  109.192.104.209 2009-10-18 18:27:27+00:00  /index.html     200
64147          2   109.192.104.86 2009-10-18 18:27:27+00:00  /index.html     200
64115          3  109.192.109.233 2009-10-18 18:26:27+00:00    /faq.html     200
65195          4  109.192.111.243 2009-10-18 20:59:37+00:00  /index.html     200
64123          5  109.192.117.151 2009-10-18 18:26:27+00:00  /index.html     200
64117          6  109.192.121.165 2009-10-18 18:26:27+00:00    /faq.html     200
65162          7    109.192.135.2 2009-10-18 20:58:33+00:00  /index.html     200
65168          8  109.192.138.239 2009-10-18 20:58:33+00:00    /faq.html     200
64140          9  109.192.143.138 2009-10-18 18:26:27+00:00    /faq.html     200
65185         10    109.192.147.1 2009-10-18 20:58:44+00:00    /faq.html     200
64124         11   109.192.156.9

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size,SessionID
64146,109.192.104.209,-,-,2009-10-18 18:27:27+00:00,GET,/index.html,HTTP/1.1,200,19395,1
64147,109.192.104.86,-,-,2009-10-18 18:27:27+00:00,GET,/index.html,HTTP/1.1,200,99209,2
64115,109.192.109.233,-,-,2009-10-18 18:26:27+00:00,GET,/faq.html,HTTP/1.1,200,757,3
65195,109.192.111.243,-,-,2009-10-18 20:59:37+00:00,GET,/index.html,HTTP/1.1,200,10937,4
64123,109.192.117.151,-,-,2009-10-18 18:26:27+00:00,GET,/index.html,HTTP/1.1,200,285,5


In [11]:
final_matrix = create_matrix(df_session)
display(final_matrix.head())


Membuat matriks Sesi x Halaman...

--- HASIL AKHIR (MATRIKS) ---
URI                 IP  /faq.html  /index.html
0      109.192.104.209          0            1
1       109.192.104.86          0            1
2      109.192.109.233          1            0
3      109.192.111.243          0            1
4      109.192.117.151          0            1
5      109.192.121.165          1            0
6        109.192.135.2          0            1
7      109.192.138.239          1            0
8      109.192.143.138          1            0
9        109.192.147.1          1            0
10      109.192.156.94          0            1
11     109.192.159.149          0            1
12      109.192.163.11          1            0
13     109.192.167.241          0            1
14      109.192.17.160          1            0
15       109.192.173.8          0            1
16     109.192.174.205          1            0
17     109.192.177.231          0            1
18     109.192.185.107          0        

URI,IP,/faq.html,/index.html
0,109.192.104.209,0,1
1,109.192.104.86,0,1
2,109.192.109.233,1,0
3,109.192.111.243,0,1
4,109.192.117.151,0,1
