In [1]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
pip install tabulate


Collecting tabulateNote: you may need to restart the kernel to use updated packages.

  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


##  Step 1 : Read the Log File

In [15]:
import pandas as pd
import re
from collections import Counter
from tabulate import tabulate


In [3]:
with open("sample.log", "r") as file:
    log_data = file.readlines()

### Step 2: Parse the log file


In [4]:
pattern = r'(?P<ip>\d+\.\d+\.\d+\.\d+) .* "(?P<method>GET|POST) (?P<endpoint>/\S*) HTTP/1.1" (?P<status>\d+) (?P<size>\d+)'
log_entries = [re.match(pattern, line) for line in log_data if re.match(pattern, line)]
parsed_logs = [match.groupdict() for match in log_entries]

### Step 3: Convert parsed logs to a DataFrame


In [6]:
df = pd.DataFrame(parsed_logs)
df

Unnamed: 0,ip,method,endpoint,status,size
0,192.168.1.1,GET,/home,200,512
1,203.0.113.5,POST,/login,401,128
2,10.0.0.2,GET,/about,200,256
3,192.168.1.1,GET,/contact,200,312
4,198.51.100.23,POST,/register,200,128
5,203.0.113.5,POST,/login,401,128
6,192.168.1.100,POST,/login,401,128
7,10.0.0.2,GET,/dashboard,200,1024
8,198.51.100.23,GET,/about,200,256
9,192.168.1.1,GET,/dashboard,200,1024


## Step 4 : Analyze the Log Data

### 1.Calculate Requests Per IP

In [8]:
# Count requests per IP
ip_counts = Counter(df['ip'])
ip_df = pd.DataFrame(ip_counts.items(), columns=["IP Address", "Request Count"]).sort_values(by="Request Count", ascending=False)
ip_df

Unnamed: 0,IP Address,Request Count
1,203.0.113.5,8
3,198.51.100.23,8
0,192.168.1.1,7
2,10.0.0.2,6
4,192.168.1.100,5


### 2.Identify the Most Frequently Accessed Endpoint

In [9]:
# Count requests per endpoint
endpoint_counts = Counter(df['endpoint'])
endpoint_df = pd.DataFrame(endpoint_counts.items(), columns=["Endpoint", "Access Count"]).sort_values(by="Access Count", ascending=False)
endpoint_df

Unnamed: 0,Endpoint,Access Count
1,/login,13
0,/home,5
2,/about,5
5,/dashboard,3
3,/contact,2
4,/register,2
6,/profile,2
7,/feedback,2


### 3.Detect Suspicious Activity

In [11]:
# Detect brute force login attempts
failed_logins = df[df['status'] == '401']
failed_counts = Counter(failed_logins['ip'])
suspicious_ips_df = pd.DataFrame(failed_counts.items(), columns=["IP Address", "Failed Login Count"]).sort_values(by="Failed Login Count", ascending=False)
suspicious_ips_df

Unnamed: 0,IP Address,Failed Login Count
0,203.0.113.5,8
1,192.168.1.100,5


## Step 5. Output Results

### 1.Display Results in Terminal

In [12]:
with open("log_analysis_results.csv", "w") as f:
    # Write Requests per IP
    f.write("Requests per IP\n")
    ip_df.to_csv(f, index=False)
    f.write("\n")

    # Write Most Accessed Endpoint
    f.write("Most Accessed Endpoint\n")
    endpoint_df.to_csv(f, index=False)
    f.write("\n")

    # Write Suspicious Activity
    f.write("Suspicious Activity\n")
    suspicious_ips_df.to_csv(f, index=False)

print("\nResults saved to 'log_analysis_results.csv'")


Results saved to 'log_analysis_results.csv'


In [16]:
data = pd.read_csv("log_analysis_results.csv")

print("\n--- Combined Log Analysis Results ---")
print(tabulate(data, headers="keys", tablefmt="grid"))



--- Combined Log Analysis Results ---
+------------------------+--------------------+
|                        | Requests per IP    |
| IP Address             | Request Count      |
+------------------------+--------------------+
| 203.0.113.5            | 8                  |
+------------------------+--------------------+
| 198.51.100.23          | 8                  |
+------------------------+--------------------+
| 192.168.1.1            | 7                  |
+------------------------+--------------------+
| 10.0.0.2               | 6                  |
+------------------------+--------------------+
| 192.168.1.100          | 5                  |
+------------------------+--------------------+
| Most Accessed Endpoint | nan                |
+------------------------+--------------------+
| Endpoint               | Access Count       |
+------------------------+--------------------+
| /login                 | 13                 |
+------------------------+--------------------+
|