# Tugas 6: Web Usage Mining

In [4]:
import pandas as pd

In [5]:
# Load the raw web-server access log from CSV and display it
# (pandas truncates the rendering of large frames automatically).
csv_path = './data/webuage.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Remote host,Remote logname,Remote user,Request time,Request method,Request URI,Request Protocol,Status,Size of response (incl. headers)
0,65.55.147.227,-,-,2009-10-15T02:00:24Z,GET,/index.html,HTTP/1.1,200,21878
1,65.55.86.34,-,-,2009-10-15T02:00:58Z,GET,/index.html,HTTP/1.1,200,1416
2,148.188.55.88,-,-,2009-10-15T02:01:41Z,GET,/faq.html,HTTP/1.1,200,10946
3,72.30.57.238,-,-,2009-10-15T02:01:59Z,GET,/contribute.txt,HTTP/1.0,200,39943
4,66.249.139.233,-,-,2009-10-15T02:02:09Z,GET,/faq.html,HTTP/1.1,200,17247
...,...,...,...,...,...,...,...,...,...
132253,66.249.196.230,-,-,2009-10-22T01:58:21Z,GET,/contribute.txt,HTTP/1.1,200,16569
132254,66.249.41.85,-,-,2009-10-22T01:58:22Z,GET,/contribute.txt,HTTP/1.1,200,380
132255,65.55.236.245,-,-,2009-10-22T01:59:33Z,GET,/index.html,HTTP/1.1,200,6695
132256,65.55.198.69,-,-,2009-10-22T01:59:34Z,GET,/contribute.txt,HTTP/1.1,200,47672


In [6]:
# Parse the 'Request time' strings as ISO 8601 timestamps. The sample rows
# shown after loading carry a trailing "Z", so the parsed values are expected
# to be timezone-aware.
parsed_times = pd.to_datetime(df['Request time'], format='ISO8601')

# Shift the timezone-aware timestamps to WIB (Asia/Jakarta, UTC+7).
# tz_convert only works on tz-aware data, which is why no tz_localize is needed.
df['Request time'] = parsed_times.dt.tz_convert('Asia/Jakarta')

# Report the resulting timezone and preview the first rows.
print("Data setelah konversi ke WIB:")
print(f"Timezone: {df['Request time'].dt.tz}")
print("\nBeberapa baris pertama:")
df.head()

Data setelah konversi ke WIB:
Timezone: Asia/Jakarta

Beberapa baris pertama:


Unnamed: 0,Remote host,Remote logname,Remote user,Request time,Request method,Request URI,Request Protocol,Status,Size of response (incl. headers)
0,65.55.147.227,-,-,2009-10-15 09:00:24+07:00,GET,/index.html,HTTP/1.1,200,21878
1,65.55.86.34,-,-,2009-10-15 09:00:58+07:00,GET,/index.html,HTTP/1.1,200,1416
2,148.188.55.88,-,-,2009-10-15 09:01:41+07:00,GET,/faq.html,HTTP/1.1,200,10946
3,72.30.57.238,-,-,2009-10-15 09:01:59+07:00,GET,/contribute.txt,HTTP/1.0,200,39943
4,66.249.139.233,-,-,2009-10-15 09:02:09+07:00,GET,/faq.html,HTTP/1.1,200,17247


In [7]:
# Keep only successful requests (HTTP status 200) whose URI contains the
# literal substring ".html".
# regex=False matters: with the default regex=True the "." in '.html' is a
# wildcard, so URIs such as "/xhtml" or "/foo-html" would wrongly match.
# na=False treats missing URIs as non-matching instead of propagating NaN
# into the boolean mask.
is_success = df['Status'] == 200
is_html_page = df['Request URI'].str.contains('.html', regex=False, na=False)
df_filtered = df[is_success & is_html_page]

# Show how many rows survived the filter, then display the filtered frame.
print(f"Total data sebelum filter: {len(df)}")
print(f"Total data setelah filter: {len(df_filtered)}")
print("\nData yang telah difilter:")
df_filtered

Total data sebelum filter: 132258
Total data setelah filter: 75718

Data yang telah difilter:


Unnamed: 0,Remote host,Remote logname,Remote user,Request time,Request method,Request URI,Request Protocol,Status,Size of response (incl. headers)
0,65.55.147.227,-,-,2009-10-15 09:00:24+07:00,GET,/index.html,HTTP/1.1,200,21878
1,65.55.86.34,-,-,2009-10-15 09:00:58+07:00,GET,/index.html,HTTP/1.1,200,1416
2,148.188.55.88,-,-,2009-10-15 09:01:41+07:00,GET,/faq.html,HTTP/1.1,200,10946
4,66.249.139.233,-,-,2009-10-15 09:02:09+07:00,GET,/faq.html,HTTP/1.1,200,17247
5,72.30.50.248,-,-,2009-10-15 09:02:13+07:00,GET,/index.html,HTTP/1.0,200,7883
...,...,...,...,...,...,...,...,...,...
132249,65.55.251.17,-,-,2009-10-22 08:57:24+07:00,GET,/index.html,HTTP/1.1,200,12300
132250,67.195.151.244,-,-,2009-10-22 08:57:57+07:00,GET,/index.html,HTTP/1.0,200,19828
132251,65.55.178.30,-,-,2009-10-22 08:58:02+07:00,GET,/index.html,HTTP/1.1,200,95655
132252,67.195.176.248,-,-,2009-10-22 08:58:07+07:00,GET,/faq.html,HTTP/1.0,200,35002


In [8]:
# Derive a numeric user ID from the (Remote host, Request Protocol) pair:
# rows sharing the same pair receive the same ID. ngroup() numbers the groups
# from 0, so 1 is added to make the IDs start at 1.
# .assign returns a new DataFrame, so the filtered slice is never mutated in
# place (this is what avoids the SettingWithCopyWarning).
user_key_columns = ['Remote host', 'Request Protocol']
df_filtered = df_filtered.assign(
    user_id=df_filtered.groupby(user_key_columns).ngroup() + 1
)

print(f"Total unique user ID: {df_filtered['user_id'].nunique()}")

print("\nContoh data dengan user_id:")
preview_columns = ['Remote host', 'Request Protocol', 'user_id', 'Request URI', 'Request time']
print(df_filtered[preview_columns].head(10))

# Spot check: every row for one sample host should show a consistent
# (host, protocol) -> user_id mapping.
print("\n--- Verifikasi: User yang sama memiliki ID yang sama ---")
sample_host_rows = df_filtered.loc[
    df_filtered['Remote host'] == '65.55.147.227',
    ['Remote host', 'Request Protocol', 'user_id'],
]
print(sample_host_rows.head())

Total unique user ID: 74905

Contoh data dengan user_id:
       Remote host Request Protocol  user_id  Request URI  \
0    65.55.147.227         HTTP/1.1    39204  /index.html   
1      65.55.86.34         HTTP/1.1    43124  /index.html   
2    148.188.55.88         HTTP/1.1    13434    /faq.html   
4   66.249.139.233         HTTP/1.1    44916    /faq.html   
5     72.30.50.248         HTTP/1.0    52199  /index.html   
8      65.55.80.97         HTTP/1.1    43011  /index.html   
9     65.55.161.41         HTTP/1.1    39548  /index.html   
10   65.55.119.204         HTTP/1.1    38556    /faq.html   
11    65.55.58.168         HTTP/1.1    42464  /index.html   
12     65.55.35.29         HTTP/1.1    41967    /faq.html   

                Request time  
0  2009-10-15 09:00:24+07:00  
1  2009-10-15 09:00:58+07:00  
2  2009-10-15 09:01:41+07:00  
4  2009-10-15 09:02:09+07:00  
5  2009-10-15 09:02:13+07:00  
8  2009-10-15 09:02:51+07:00  
9  2009-10-15 09:02:54+07:00  
10 2009-10-15 09:02:55+

In [9]:
# All three summaries below look at the same column, so fetch it once.
uri_column = df_filtered['Request URI']

# Number of distinct request URIs.
unique_uri_count = uri_column.nunique()
print(f"Jumlah Request URI unik: {unique_uri_count}")

# A sample (at most 20) of the distinct URIs.
print("\nContoh Request URI unik:")
print(uri_column.unique()[:20])

# The ten most frequently requested URIs with their hit counts.
print("\nTop 10 Request URI yang paling sering diakses:")
print(uri_column.value_counts().head(10))

Jumlah Request URI unik: 2

Contoh Request URI unik:
['/index.html' '/faq.html']

Top 10 Request URI yang paling sering diakses:
Request URI
/index.html    50438
/faq.html      25280
Name: count, dtype: int64


In [10]:
# Gaps of this many seconds or more (1 hour) are treated as the start of a new
# session rather than time spent on a page, and are zeroed out.
MAX_PAGE_GAP_SECONDS = 3600

# Order each user's requests chronologically so consecutive rows are
# consecutive requests by the same user.
df_sorted = df_filtered.sort_values(['user_id', 'Request time']).reset_index(drop=True)

# Seconds elapsed since the same user's previous request. The first request of
# every user has no predecessor, so its gap is NaN.
# NOTE(review): the gap is stored on the LATER request of each pair, i.e. the
# time between request A and request B is attributed to B's URI. If "duration"
# is meant to be the time spent on the page being left (A), this should use
# diff(-1) / shift(-1) instead — confirm the intended definition.
gap_seconds = df_sorted.groupby('user_id')['Request time'].diff().dt.total_seconds()

# Keep only gaps strictly between 0 and the session cutoff; everything else
# (NaN for first requests, zero gaps, gaps >= cutoff) becomes 0.
# Comparisons with NaN are False, so no separate fillna step is needed, and
# the vectorized mask replaces a per-row Python lambda.
is_valid_gap = gap_seconds.gt(0) & gap_seconds.lt(MAX_PAGE_GAP_SECONDS)
df_sorted['duration'] = gap_seconds.where(is_valid_gap, 0.0)

print("Data dengan durasi:")
print(df_sorted[['user_id', 'Request URI', 'Request time', 'duration']].head(20))

Data dengan durasi:
    user_id  Request URI              Request time  duration
0         1  /index.html 2009-10-19 01:27:27+07:00       0.0
1         2  /index.html 2009-10-19 01:27:27+07:00       0.0
2         3    /faq.html 2009-10-19 01:26:27+07:00       0.0
3         4  /index.html 2009-10-19 03:59:37+07:00       0.0
4         5  /index.html 2009-10-19 01:26:27+07:00       0.0
5         6    /faq.html 2009-10-19 01:26:27+07:00       0.0
6         7  /index.html 2009-10-19 03:58:33+07:00       0.0
7         8    /faq.html 2009-10-19 03:58:33+07:00       0.0
8         9    /faq.html 2009-10-19 01:26:27+07:00       0.0
9        10    /faq.html 2009-10-19 03:58:44+07:00       0.0
10       11  /index.html 2009-10-19 01:26:27+07:00       0.0
11       12  /index.html 2009-10-19 03:58:33+07:00       0.0
12       13    /faq.html 2009-10-19 03:58:45+07:00       0.0
13       14  /index.html 2009-10-19 01:26:27+07:00       0.0
14       15    /faq.html 2009-10-19 03:58:45+07:00       0.0
15  

In [11]:
# Keep only the rows whose duration is non-zero, i.e. drop the rows whose
# duration was set to 0 in the previous step.
has_duration = df_sorted['duration'].ne(0)
df_filtered2 = df_sorted.loc[has_duration]

# Report the row counts before and after, then display the result.
print(f"Total data sebelum filter: {len(df_sorted)}")
print(f"Total data setelah filter (duration != 0): {len(df_filtered2)}")
print("\nData yang telah difilter:")
df_filtered2

Total data sebelum filter: 75718
Total data setelah filter (duration != 0): 60

Data yang telah difilter:


Unnamed: 0,Remote host,Remote logname,Remote user,Request time,Request method,Request URI,Request Protocol,Status,Size of response (incl. headers),user_id,duration
6577,134.226.99.103,-,-,2009-10-15 17:18:48+07:00,GET,/index.html,HTTP/1.1,200,9819,6572,506.0
6742,134.34.108.221,-,-,2009-10-20 23:29:23+07:00,GET,/faq.html,HTTP/1.1,200,12774,6734,852.0
7095,134.34.128.85,-,-,2009-10-15 18:56:09+07:00,GET,/faq.html,HTTP/1.1,200,21499,7073,3012.0
7535,134.34.154.44,-,-,2009-10-21 21:24:24+07:00,GET,/faq.html,HTTP/1.1,200,13635,7500,2519.0
8553,134.34.208.24,-,-,2009-10-20 22:22:25+07:00,GET,/index.html,HTTP/1.1,200,10937,8494,3548.0
9427,134.34.26.187,-,-,2009-10-21 21:08:32+07:00,GET,/faq.html,HTTP/1.1,200,251,9343,2159.0
9554,134.34.33.129,-,-,2009-10-21 20:27:34+07:00,GET,/index.html,HTTP/1.1,200,23043,9466,1.0
9664,134.34.39.243,-,-,2009-10-20 22:32:34+07:00,GET,/index.html,HTTP/1.1,200,13678,9571,2949.0
9956,134.34.55.24,-,-,2009-10-21 17:17:57+07:00,GET,/index.html,HTTP/1.1,200,12774,9852,2997.0
11384,141.24.161.4,-,-,2009-10-21 19:22:56+07:00,GET,/faq.html,HTTP/1.1,200,9260,11255,114.0
