In [1]:
import pandas as pd
from datetime import timedelta

# --- Replace the value below with the location of your own log file ---
log_file_path = 'NASA_Jul95_cleaned.csv'
# -----------------------------------------------------------------------

# Session time limit.
SESSION_TIMEOUT = timedelta(minutes=30)

# File extensions of asset files that are meant to be filtered out (discarded).
ASSET_EXTENSIONS = [
    '.css',
    '.js',
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.ico',
    '.txt',
    '.svg',
    '.woff',
    '.woff2',
    '.ttf',
    '.eot',
    '.zip',
]

## Fungsi untuk load data

In [2]:
def load_data(file_path):
    """
    Load the access log from a CSV file and print initial sanity checks.

    The first row of the file is skipped and the columns are named
    explicitly (IP, Logname, User, Time, Method, URI, Protocol, Status,
    Size). Rows the parser cannot handle are skipped.

    Printed diagnostics: total row count, the ten most frequent IP values,
    the number of distinct IPs, the number of IPs that occur more than once,
    and a per-column NaN summary.

    Args:
        file_path: Path of the CSV log file to read.

    Returns:
        The loaded pandas DataFrame, or None when the file does not exist or
        cannot be read (an error message is printed in that case).
    """
    print(f"Membaca log dari {file_path}...")

    col_names = [
        'IP', 'Logname', 'User', 'Time', 'Method',
        'URI', 'Protocol', 'Status', 'Size'
    ]

    try:
        df = pd.read_csv(
            file_path,
            sep=',', header=None, names=col_names,
            skiprows=1, quotechar='"', on_bad_lines='skip',
            engine='python'
        )
    except FileNotFoundError:
        print(f"Error: File tidak ditemukan di {file_path}")
        return None
    except Exception as e:
        # Deliberate best-effort: report any other read problem and signal
        # failure to the caller with None instead of raising.
        print(f"Error saat membaca file: {e}")
        return None

    # Reaching this point means read_csv succeeded, so df is a DataFrame;
    # no further None check is needed.
    print("Pratinjau data mentah berhasil dibaca.")
    print("-" * 50)

    print(f"üìä Jumlah baris data mentah (Awal): {len(df)}")

    ip_counts = df['IP'].value_counts()
    print("\nüèÜ Top 10 IP dengan akses terbanyak:")
    print(ip_counts.head(10))

    # IPs that appear on more than one log row.
    dupe_ips = ip_counts[ip_counts > 1]
    print(f"\n‚ö†Ô∏è Jumlah Unik IP: {len(ip_counts)}")
    print(f"‚ö†Ô∏è Jumlah IP yang 'Duplikat' (akses > 1 kali): {len(dupe_ips)}")
    print("-" * 50)

    print("\nüîç Mengecek nilai NaN dalam dataset mentah...")
    # Count the nulls per column once and derive every NaN figure from it.
    nan_counts = df.isnull().sum()
    nan_percentages = (nan_counts / len(df)) * 100
    nan_info = pd.DataFrame({'NaN Count': nan_counts, 'NaN Percentage': nan_percentages})
    print(nan_info[nan_info['NaN Count'] > 0].to_string())
    print("Total NaN values in raw data:", nan_counts.sum())
    print("-" * 50)

    return df

## Fungsi untuk memproses data

In [3]:
def preprocess_data(df, asset_extensions=ASSET_EXTENSIONS):
    """
    Clean and filter the raw log down to successful HTML page requests.

    Steps:
      1. keep rows whose Method equals 'GET';
      2. coerce Status to a number, drop rows where that fails, keep 200;
      3. keep rows whose URI ends with '.html' (case-insensitive; a missing
         URI is treated as an empty string and therefore dropped);
      4. parse the Time column as ISO 8601 datetimes.

    A preview of the result and before/after row counts are printed.

    Args:
        df: Raw log DataFrame providing at least the columns Method, Status,
            URI and Time, or None.
        asset_extensions: Accepted for interface compatibility only; this
            function does not read it (filtering is done with the '.html'
            suffix check instead).

    Returns:
        A new, filtered DataFrame (the input frame is left unmodified), or
        None when df is None.
    """
    if df is None:
        return None

    print("Memulai preprocessing (filter GET, 200, dan aset)... ")

    # 1. Filter: keep only the 'GET' request method.
    df_filtered = df[df['Method'] == 'GET'].copy()

    # 2. Filter: keep only status 200 (OK). Non-numeric statuses become NaN
    #    and are dropped before the comparison.
    df_filtered['Status'] = pd.to_numeric(df_filtered['Status'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Status'])
    df_filtered = df_filtered[df_filtered['Status'] == 200].copy()

    # 3. Filter: keep only URIs that end with .html.
    is_html = df_filtered['URI'].fillna('').str.lower().str.endswith('.html')
    df_clean = df_filtered[is_html].copy()

    # Convert the 'Time' column to datetime.
    print("Mengonversi waktu (dengan format ISO8601)...")
    df_clean['Time'] = pd.to_datetime(df_clean['Time'], format='ISO8601')

    print("Data setelah preprocessing (sebelum pengelompokan sesi):")
    print(df_clean.head().to_string())
    print("-" * 50)

    jumlah_awal = len(df)
    jumlah_akhir = len(df_clean)
    jumlah_dibuang = jumlah_awal - jumlah_akhir

    # An empty input frame has nothing to shrink; report 0% instead of
    # dividing by zero.
    if jumlah_awal:
        persen_penyusutan = (jumlah_dibuang / jumlah_awal) * 100
    else:
        persen_penyusutan = 0.0

    print(f"‚úÖ Jumlah data setelah diproses (df_clean): {jumlah_akhir}")
    print(f"üóëÔ∏è Jumlah data 'sampah' (aset/error) yang dibuang: {jumlah_dibuang}")
    print(f"üìâ Persentase penyusutan data: {persen_penyusutan:.2f}%")
    print("-" * 50)

    return df_clean

## Fungsi untuk melakukan sessionize data

In [4]:
def sessionize_data(df_clean, session_timeout=SESSION_TIMEOUT):
    """
    Assign a SessionID to every log row, grouping by IP and idle time.

    Rows are sorted by IP and Time. A new session starts at the first row of
    each IP and whenever the gap to the previous row of the same IP is
    greater than session_timeout. The result is previewed on stdout and
    written to 'hasil_log_dengan_sesiNASA.csv'; a failure to write the file
    is reported but does not abort the function.

    Args:
        df_clean: Preprocessed log DataFrame with IP, Time, URI and Status
            columns, or None.
        session_timeout: Largest gap between consecutive requests of one IP
            that still counts as the same session.

    Returns:
        A sorted copy of the data with an added SessionID column, or None
        when df_clean is None.
    """
    if df_clean is None:
        return None

    print(f"Mengelompokkan sesi (timeout: {session_timeout})...")

    # sort_values returns a new frame, so the column added below does not
    # touch the frame passed in by the caller.
    ordered = df_clean.sort_values(by=['IP', 'Time'])

    # Gap to the previous request of the same IP (NaT on each IP's first row).
    gap = ordered.groupby('IP')['Time'].diff()
    first_request = gap.isna()
    timed_out = gap > session_timeout

    # Running count of session starts gives consecutive session numbers.
    ordered['SessionID'] = (first_request | timed_out).cumsum()

    preview_columns = ['SessionID', 'IP', 'Time', 'URI', 'Status']
    print("\n--- PRATINJAU LOG DENGAN SESI ---")
    print(ordered[preview_columns].head(15).to_string())
    print("-" * 50)

    try:
        ordered.to_csv('hasil_log_dengan_sesiNASA.csv', index=False)
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_log_dengan_sesiNASA.csv': {e}")
    else:
        print("\nData log yang sudah dikelompokkan disimpan ke 'hasil_log_dengan_sesiNASA.csv'")

    return ordered

## Fungsi untuk membuat matrix

In [5]:
def create_matrix(df_sessionized, max_rows=10, max_cols=10):
    """
    Build a binary Session x Page matrix from sessionized log data.

    One row is produced per (SessionID, IP) pair and one column per distinct
    URI; a cell is 1 when the session requested that URI at least once and 0
    otherwise. The SessionID column is dropped from the result, so rows are
    labelled by IP only. The full matrix is written to
    'hasil_matriks_sesiNASA.csv'; a failure to write the file is reported
    but does not abort the function.

    Only a truncated preview of the matrix is printed, because the full
    matrix can have one row per session and one column per page. Pass
    max_rows=None and max_cols=None to print everything.

    Args:
        df_sessionized: DataFrame with SessionID, IP and URI columns, or None.
        max_rows: Maximum number of rows shown in the printed preview.
        max_cols: Maximum number of columns shown in the printed preview.

    Returns:
        The binary matrix as a DataFrame (an IP column followed by one
        column per URI), or None when df_sessionized is None.
    """
    if df_sessionized is None:
        return None

    print("\nMembuat matriks Sesi x Halaman...")

    # Count requests per (session, IP) row and URI column.
    matrix = pd.crosstab(
        index=[df_sessionized['SessionID'], df_sessionized['IP']],
        columns=df_sessionized['URI']
    )
    # Collapse counts to visited / not visited.
    matrix_binary = (matrix > 0).astype(int)
    final_output = matrix_binary.reset_index().drop('SessionID', axis=1)

    print("\n--- HASIL AKHIR (MATRIKS) ---")
    # Bounded preview; the complete matrix goes to the CSV file below.
    print(final_output.to_string(
        max_rows=max_rows, max_cols=max_cols, show_dimensions=True
    ))

    try:
        final_output.to_csv('hasil_matriks_sesiNASA.csv', index=False)
        print("\nMatriks hasil juga disimpan ke 'hasil_matriks_sesiNASA.csv'")
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_matriks_sesiNASA.csv': {e}")

    return final_output

## Memulai proses pemanggilan fungsi



In [6]:
# Load the raw access log from the configured CSV path and preview its first rows.
# NOTE: load_data returns None when the file cannot be read; in that case the
# .head() call below raises AttributeError.
df_raw = load_data(log_file_path)
display(df_raw.head())

Membaca log dari NASA_Jul95_cleaned.csv...
Pratinjau data mentah berhasil dibaca.
--------------------------------------------------
üìä Jumlah baris data mentah (Awal): 411522

üèÜ Top 10 IP dengan akses terbanyak:
IP
piweba3y.prodigy.com     4677
alyssa.prodigy.com       2950
piweba1y.prodigy.com     1984
disarray.demon.co.uk     1837
www-b6.proxy.aol.com     1738
news.ti.com              1453
piweba2y.prodigy.com     1189
poppy.hensa.ac.uk        1105
advantis.vnet.ibm.com    1046
www-d1.proxy.aol.com     1041
Name: count, dtype: int64

‚ö†Ô∏è Jumlah Unik IP: 27724
‚ö†Ô∏è Jumlah IP yang 'Duplikat' (akses > 1 kali): 25181
--------------------------------------------------

üîç Mengecek nilai NaN dalam dataset mentah...
Empty DataFrame
Columns: [NaN Count, NaN Percentage]
Index: []
Total NaN values in raw data: 0
--------------------------------------------------


Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
0,199.72.81.55,-,-,1995-07-01T00:00:01Z,GET,/history/apollo/,HTTP/1.0,200,6245
1,unicomp6.unicomp.net,-,-,1995-07-01T00:00:06Z,GET,/shuttle/countdown/,HTTP/1.0,200,3985
2,199.120.110.21,-,-,1995-07-01T00:00:09Z,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
3,burger.letters.com,-,-,1995-07-01T00:00:11Z,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0
4,199.120.110.21,-,-,1995-07-01T00:00:11Z,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179


In [7]:
# Filter the raw log (GET requests, status 200, URIs ending in .html), parse the
# timestamps, and preview the first rows of the cleaned frame.
df_cleaned = preprocess_data(df_raw)
display(df_cleaned.head())

Memulai preprocessing (filter GET, 200, dan aset)... 
Mengonversi waktu (dengan format ISO8601)...
Data setelah preprocessing (sebelum pengelompokan sesi):
                           IP Logname User                      Time Method                                           URI  Protocol  Status   Size
2              199.120.110.21       -    - 1995-07-01 00:00:09+00:00    GET  /shuttle/missions/sts-73/mission-sts-73.html  HTTP/1.0     200   4085
7             205.212.115.106       -    - 1995-07-01 00:00:12+00:00    GET             /shuttle/countdown/countdown.html  HTTP/1.0     200   3985
18  ppptky391.asahi-net.or.jp       -    - 1995-07-01 00:00:18+00:00    GET                         /facts/about_ksc.html  HTTP/1.0     200   3977
22   waters-gw.starway.net.au       -    - 1995-07-01 00:00:25+00:00    GET      /shuttle/missions/51-l/mission-51-l.html  HTTP/1.0     200   6723
37     gayle-gaston.tenet.edu       -    - 1995-07-01 00:00:50+00:00    GET  /shuttle/missions/sts-71/mission

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
2,199.120.110.21,-,-,1995-07-01 00:00:09+00:00,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
7,205.212.115.106,-,-,1995-07-01 00:00:12+00:00,GET,/shuttle/countdown/countdown.html,HTTP/1.0,200,3985
18,ppptky391.asahi-net.or.jp,-,-,1995-07-01 00:00:18+00:00,GET,/facts/about_ksc.html,HTTP/1.0,200,3977
22,waters-gw.starway.net.au,-,-,1995-07-01 00:00:25+00:00,GET,/shuttle/missions/51-l/mission-51-l.html,HTTP/1.0,200,6723
37,gayle-gaston.tenet.edu,-,-,1995-07-01 00:00:50+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12040


In [8]:
# Assign session IDs per IP using the configured timeout and preview the result.
# sessionize_data also writes the sessionized log to 'hasil_log_dengan_sesiNASA.csv'.
df_session = sessionize_data(df_cleaned, SESSION_TIMEOUT)
display(df_session.head())

Mengelompokkan sesi (timeout: 0:30:00)...

--- PRATINJAU LOG DENGAN SESI ---
        SessionID                               IP                      Time                                                     URI  Status
299938          1  01-dynamic-c.wokingham.luna.net 1995-07-05 07:36:59+00:00             /shuttle/missions/sts-71/movies/movies.html     200
247446          2  03-dynamic-c.wokingham.luna.net 1995-07-04 12:33:06+00:00            /shuttle/missions/sts-70/mission-sts-70.html     200
247464          2  03-dynamic-c.wokingham.luna.net 1995-07-04 12:33:23+00:00            /shuttle/missions/sts-71/mission-sts-71.html     200
247629          2  03-dynamic-c.wokingham.luna.net 1995-07-04 12:36:10+00:00             /shuttle/missions/sts-71/movies/movies.html     200
207417          3  04-dynamic-c.rotterdam.luna.net 1995-07-03 21:56:38+00:00            /shuttle/missions/sts-70/mission-sts-70.html     200
207447          3  04-dynamic-c.rotterdam.luna.net 1995-07-03 21:57:07+00:00 

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size,SessionID
299938,01-dynamic-c.wokingham.luna.net,-,-,1995-07-05 07:36:59+00:00,GET,/shuttle/missions/sts-71/movies/movies.html,HTTP/1.0,200,3089,1
247446,03-dynamic-c.wokingham.luna.net,-,-,1995-07-04 12:33:06+00:00,GET,/shuttle/missions/sts-70/mission-sts-70.html,HTTP/1.0,200,13469,2
247464,03-dynamic-c.wokingham.luna.net,-,-,1995-07-04 12:33:23+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12451,2
247629,03-dynamic-c.wokingham.luna.net,-,-,1995-07-04 12:36:10+00:00,GET,/shuttle/missions/sts-71/movies/movies.html,HTTP/1.0,200,3089,2
207417,04-dynamic-c.rotterdam.luna.net,-,-,1995-07-03 21:56:38+00:00,GET,/shuttle/missions/sts-70/mission-sts-70.html,HTTP/1.0,200,13468,3


In [9]:
# Build the binary session x page matrix and preview its first rows.
# create_matrix also writes the matrix to 'hasil_matriks_sesiNASA.csv'.
final_matrix = create_matrix(df_session)
display(final_matrix.head())


Membuat matriks Sesi x Halaman...

--- HASIL AKHIR (MATRIKS) ---

Matriks hasil juga disimpan ke 'hasil_matriks_sesiNASA.csv'


URI,IP,//ksc.html,//shuttle/missions/missions.html,/base-ops/procurement/procurement.html,/biomed/bibliography/biblio.html,/biomed/climate/airqual.html,/biomed/climate/climate.html,/biomed/env.html,/biomed/fire/fire.html,/biomed/glossary/glossary.html,...,/statistics/1995/May/May95_reverse_domains.html,/statistics/1995/bkup/Apr95_full.html,/statistics/1995/bkup/Feb95_full.html,/statistics/1995/bkup/Jan95_full.html,/statistics/1995/bkup/Mar95.html,/statistics/1995/bkup/Mar95_full.html,/statistics/statistics.html,/welcome.html,/whats-new.html,/~downs/home.html
0,01-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,03-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,04-dynamic-c.rotterdam.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,04-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,05-dynamic-c.rotterdam.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
