In [1]:
# 统计文本文件中a-z字符（忽略大小写）的数量和比例

import os
from collections import defaultdict
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def char_counter(file_path, file_name):
    """Count the letters a-z (case-insensitively) in a text file.

    Args:
        file_path: Directory that contains the file.
        file_name: Name of the file inside ``file_path``.

    Returns:
        A pandas DataFrame indexed by letter (ascending; only letters that
        occur at least once) with two columns: ``num`` is the number of
        occurrences and ``ratio(%)`` is that letter's share of all counted
        letters, in percent.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Build the full path instead of calling os.chdir(): changing the working
    # directory affects the whole process and breaks a second call that
    # passes a relative file_path.
    full_path = os.path.join(file_path, file_name)

    counter = defaultdict(int)
    chars = set(string.ascii_lowercase)
    # errors='replace' keeps undecodable bytes from aborting the scan; the
    # replacement character is not a letter, so counts are unaffected.
    with open(full_path, 'r', errors='replace') as file:
        for line in file:
            for ch in line.lower():
                if ch in chars:
                    counter[ch] += 1

    df_counter = pd.Series(dict(counter), dtype='int64', name='num').to_frame()
    df_counter['ratio(%)'] = df_counter['num'] / df_counter['num'].sum() * 100

    return df_counter.sort_index()

In [2]:
# Both logs live in the same directory; print the letter statistics for each.
file_path = r'D:\userdata\anliu\Desktop\log\BTSLogs'
for file_name in (r'SYSLOG_271.LOG', r'SYSLOG_272.LOG'):
    print(char_counter(file_path, file_name))

       num   ratio(%)
a  2224844   5.343536
b  1852672   4.449669
c  4884618  11.731668
d  2553488   6.132859
e  5404488  12.980269
f  1738555   4.175587
g   907902   2.180560
h   369749   0.888047
i  1192689   2.864550
j    25131   0.060359
k    96527   0.231834
l  2497721   5.998920
m   925671   2.223237
n  1865858   4.481338
o  1935668   4.649005
p  2184015   5.245474
q   138729   0.333193
r  2781760   6.681113
s  2656555   6.380401
t  1919278   4.609640
u  2061633   4.951542
v   161908   0.388864
w   136572   0.328013
x   609126   1.462973
y   104945   0.252052
z   406076   0.975296
       num   ratio(%)
a  2247213   5.369576
b  1883138   4.499642
c  4953875  11.836977
d  2575652   6.154361
e  5408369  12.922962
f  1752979   4.188635
g   922219   2.203585
h   355381   0.849161
i  1188541   2.839945
j    26001   0.062128
k    99938   0.238796
l  2514954   6.009327
m   933425   2.230361
n  1853532   4.428900
o  1956328   4.674524
p  2163939   5.170598
q   143607   0.343140
r  2801769

In [3]:
# 统计文本文件中英文单词的数量和比例
import re
import collections

def word_counter(file_path, file_name):
    """Count purely alphabetic words (case-sensitively) in a text file.

    Each line is split on ``;``, ``.``, ``:``, ``,`` and whitespace; only
    tokens for which ``str.isalpha()`` is true are counted.

    Args:
        file_path: Directory that contains the file.
        file_name: Name of the file inside ``file_path``.

    Returns:
        A pandas DataFrame indexed by word (sorted ascending) with two
        columns: ``num`` is the number of occurrences and ``ratio(%)`` is
        that word's share of all counted words, in percent.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Build the full path instead of calling os.chdir(): changing the working
    # directory affects the whole process and breaks a second call that
    # passes a relative file_path.
    full_path = os.path.join(file_path, file_name)

    counter = defaultdict(int)
    # errors='replace' keeps undecodable bytes from aborting the scan; a token
    # containing the replacement character fails isalpha() and is skipped.
    with open(full_path, 'r', errors='replace') as file:
        # Count line by line so the full token list is never held in memory.
        for line in file:
            for word in re.split(r'[;\.\:\s\,]', line):
                if word.isalpha():
                    counter[word] += 1

    df_counter = pd.Series(dict(counter), dtype='int64', name='num').to_frame()
    df_counter['ratio(%)'] = df_counter['num'] / df_counter['num'].sum() * 100

    return df_counter.sort_index()

In [4]:
# Word statistics for the first syslog file.
file_path, file_name = r'D:\userdata\anliu\Desktop\log\BTSLogs', r'SYSLOG_271.LOG'
print(word_counter(file_path, file_name))

                                    num  ratio(%)
A                                   126  0.005457
ACK                                 294  0.012733
ACKD                                126  0.005457
ACKP                                126  0.005457
ADETBtsComponentsResourceMonitor      4  0.000173
ADETMemoryCleaner                     1  0.000043
AIF                                  44  0.001906
AL                                   92  0.003985
ALDU                                 48  0.002079
APW                                 789  0.034172
AaSNTPBaseStates                      8  0.000346
AaSNTPBaseStatesMsgHandler            4  0.000173
AaSNTPDBGmode                         4  0.000173
AaSNTPGetPeerStatus                  12  0.000520
AaSNTPSetStatusData                   8  0.000346
AaSysComMsg                           1  0.000043
AaSyslog                            801  0.034691
AaUtcZoneGet                         47  0.002036
Access                               56  0.002425


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer

def word_counter1(file_path, file_name):
    """Return every purely alphabetic token of a text file, in file order.

    Each line is split on ``;``, ``.``, ``:``, ``,`` and whitespace; only
    tokens for which ``str.isalpha()`` is true are kept. Duplicates are
    preserved.

    Args:
        file_path: Directory that contains the file.
        file_name: Name of the file inside ``file_path``.

    Returns:
        A list of the alphabetic tokens in order of appearance.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Build the full path instead of calling os.chdir(): changing the working
    # directory affects the whole process and breaks a second call that
    # passes a relative file_path.
    full_path = os.path.join(file_path, file_name)

    words = []
    # errors='replace' keeps undecodable bytes from aborting the scan; a token
    # containing the replacement character fails isalpha() and is skipped.
    with open(full_path, 'r', errors='replace') as file:
        for line in file:
            # Filter while reading so the raw token list is never materialised.
            words.extend(
                token for token in re.split(r'[;\.\:\s\,]', line)
                if token.isalpha()
            )

    return words

# Build the stemmed TF-IDF vocabulary of the first syslog file.
file_path = r'D:\userdata\anliu\Desktop\log\BTSLogs'
file_name = r'SYSLOG_271.LOG'
words = word_counter1(file_path, file_name)
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]
# Each stemmed word is passed to the vectorizer as its own one-word document.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(words)
# print(X.toarray())
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() yields plain Python strings so the printed list keeps the
    # same format as the old method's output.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['aa', 'aasntpbasest', 'aasntpbasestatesmsghandl', 'aasntpdbgmod', 'aasntpgetpeerstatu', 'aasntpsetstatusdata', 'aasyscommsg', 'aasyslog', 'aautczoneget', 'ab', 'abcdefgh', 'ac', 'accept', 'access', 'ack', 'ackd', 'ackeddata', 'ackedpacket', 'acknack', 'ackp', 'ackreceivedharqid', 'action', 'activ', 'actsdl', 'actual', 'ad', 'addantennarel', 'addfrrel', 'addit', 'addtosynclist', 'adetbtscomponentsresourcemonitor', 'adetmemoryclean', 'ae', 'af', 'agent', 'aif', 'al', 'alarm', 'aldu', 'aliv', 'alloc', 'alreadi', 'amountctrloctet', 'amountofbuffereddata', 'amountofbufferedsdu', 'amountofoctet', 'amountofwastedmemori', 'ampduseg', 'answer', 'antenna', 'antennahandl', 'app', 'apw', 'aqmdata', 'aqmpacket', 'area', 'arriv', 'asnpayloadid', 'assum', 'attempt', 'author', 'avail', 'averag', 'ba', 'band', 'base', 'bb', 'bbswitch', 'bbswitchsyncmanag', 'bc', 'bcch', 'bcn', 'bcnoversfn', 'bd', 'bearer', 'becaus', 'beg', 'begin', 'behaviour', 'bf', 'bin', 'binari', 'bit', 'bitrat', 'bler', 'bm', 'bo