# 日志

In [None]:
# 如何抽取日志中的有效信息

In [3]:
import datetime

# Sample access-log line used as input for the parsing demo below.
# The backslash at the end of the first two physical lines is a line
# continuation inside the string literal, so the value is a single log line
# followed by one trailing newline.
line = """183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] \
"GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" \
"Mezilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"
"""

CHARS = set(" \t")  # constant, hence the upper-case name

# Opening delimiter -> the closing delimiter that terminates that field.
_CLOSERS = {'"': '"', "[": "]"}


def makekey(line:str):
    """Lazily split one access-log line into its fields.

    Fields are separated by runs of the characters in ``CHARS``. Text wrapped
    in ``"..."`` or ``[...]`` is yielded as ONE field, without the delimiters
    and with any inner separators preserved. A delimited field ends only at
    the closer that matches its opener, so ``"Mozilla/4.76 [en] (X11)"`` stays
    intact instead of being cut at the inner ``]``.

    :param line: one log line; a trailing ``\\r``/``\\n`` terminator is ignored
        so it is never reported as a field of its own.
    :return: generator yielding the field strings in order of appearance.
        An empty delimited field (``""``) is yielded as ``''``; an
        unterminated delimited field is yielded up to the end of the line.
    """
    # The line terminator is not data; dropping it avoids a bogus last field.
    line = line.rstrip("\r\n")
    start = 0
    closer = None  # closing delimiter we are waiting for, or None when outside
    for i, c in enumerate(line):
        if closer is not None:
            # Inside a delimited field: only the matching closer is special.
            if c == closer:
                closer = None
                yield line[start:i]
                start = i + 1
            continue

        if c in _CLOSERS:
            # Entering a delimited field; its content starts after the opener.
            closer = _CLOSERS[c]
            start = i + 1
        elif c in CHARS:
            # Separator: emit the pending field unless this is a repeated
            # separator (start == i means nothing accumulated).
            if start < i:
                yield line[start:i]
            start = i + 1

    # Whatever is left after the last separator/closer is the final field.
    if start < len(line):
        yield line[start:]
            
    
# Key for each field, in the order makekey() yields the fields.
# The three "-" entries share one key, so only the last of them survives in d.
names = (
    "remote", "-", "-", "datetime", "protocol",
    "status", "size", "-", "useragent",
)

# Converter for each field, position-aligned with `names`;
# None means "keep the raw string".
ops = (
    None,
    None,
    None,
    lambda stamp: datetime.datetime.strptime(stamp, "%d/%b/%Y:%H:%M:%S %z"),
    lambda request: request.split(),
    int,
    int,
    None,
    None,
)

# Pair every field with its key and converter, then build the record.
d = dict(
    (name, op(data) if op is not None else data)
    for name, op, data in zip(names, ops, makekey(line))
)
print(d)

{'remote': '183.60.212.153', '-': '-', 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800))), 'protocol': ['GET', '/o2o/media.html?menu=3', 'HTTP/1.1'], 'status': 200, 'size': 16691, 'useragent': 'Mezilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}


In [9]:
import datetime
import re

# Sample access-log line used as input for the regex-based parser below.
# The trailing backslashes inside the literal are line continuations, so the
# value is a single log line followed by one newline.
line = """183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] \
"GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" \
"Mezilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"
"""

# Raw string: the pattern is full of regex escapes (\d, \s, \[, \.) which are
# invalid *string* escapes and trigger a warning in a normal literal.
# The named groups become the keys of the parsed record; the second-to-last
# quoted field is matched but deliberately not captured.
PATTERN = r'''(?P<remote>[\d\.]{7,})\s-\s-\s\[(?P<datetime>[^\[\]]+)\]\s"(?P<protocol>[^"]+)"\s(?P<status>\d{3})\s(?P<size>\d+)\s"[^"]+"\s"(?P<useragent>[^"]+)"'''
regex = re.compile(PATTERN)

# Per-group converters; groups without an entry are kept as raw strings.
ops = {"datetime": lambda datestr: datetime.datetime.strptime(datestr, "%d/%b/%Y:%H:%M:%S %z"),
       # Split the request line into its three whitespace-separated parts.
       "protocol": lambda x: dict(zip(("method", "url", "protocol"), x.split())),
       "status": int,
       "size": int}

def extract(line:str):
    """Parse one log line with the module-level ``regex``.

    Each named group is passed through its converter in ``ops`` when one is
    registered for that group name; otherwise the raw string is kept.

    :param line: the log line to parse (matched from its start).
    :return: dict mapping group name to converted value, or None when the
        line does not match the pattern.
    """
    matcher = regex.match(line)
    if matcher is None:
        return None

    record = {}
    for name, data in matcher.groupdict().items():
        # Convert only the groups that have a registered converter.
        record[name] = ops[name](data) if name in ops else data
    return record

# Parse the sample line and show the record (None would mean "no match").
print(extract(line))

{'remote': '183.60.212.153', 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800))), 'protocol': {'method': 'GET', 'url': '/o2o/media.html?menu=3', 'protocol': 'HTTP/1.1'}, 'status': 200, 'size': 16691, 'useragent': 'Mezilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}


# 日志分析

In [None]:
一般采集流程：
日志产出 -> 采集(Logstash、Flume、Scribe) -> 存储 -> 分析 -> 存储(数据库、NoSQL) -> 可视化

In [None]:
文本分析依赖下面三项技术：
文件操作，
字符串操作，
正则表达式。

# 生产者和消费者、queue

In [None]:
解决办法：队列queue
作用：解耦、缓冲

In [None]:
单机：使用内建的queue模块构建进程内的队列，满足多个线程间的生产消费需要。
大型项目：使用第三方消息中间件——RabbitMQ、RocketMQ、Kafka。

# 面向对象

In [None]:
一切皆对象
对象是数据/属性和操作/方法/行为的封装
对象是独立的，但是对象之间可以相互作用
目前OOP是最接近人类认知的编程范式

In [8]:
# Minimal class used by the introspection examples that follow.
# (Documented with comments rather than a docstring so that the class's
# __doc__ entry stays None.)
class A:
    def __init__(self, name:str="hi"):
        # Plain instance attribute: it lives in the instance's __dict__.
        self.name = name

# Compare what __name__ / __class__ report for the class object itself
# versus for an instance created from it. One value is printed per line.
print(
    A.__name__,               # name of the class
    A.__class__,              # class of the class object
    A().__class__,            # class of an instance
    A.__class__.__name__,     # name of the class object's class
    A().__class__.__name__,   # name of the instance's class
    sep="\n",
)

A
<class 'type'>
<class '__main__.A'>
type
A


In [7]:
# Contrast the class namespace with the instance namespace.
# vars(obj) returns obj.__dict__ and type(obj) is obj.__class__ here.
hi = A()
print(1, sorted(vars(A).items()))          # class attributes
print(2, type(hi))                         # class of the instance
print(3, sorted(vars(hi).items()))         # instance attributes, sorted
print(4, vars(hi))                         # instance attributes, raw dict
print(5, sorted(vars(type(hi)).items()))   # class attributes, reached via the instance

1 [('__dict__', <attribute '__dict__' of 'A' objects>), ('__doc__', None), ('__init__', <function A.__init__ at 0x0000000005BAA598>), ('__module__', '__main__'), ('__weakref__', <attribute '__weakref__' of 'A' objects>)]
2 <class '__main__.A'>
3 [('name', 'hi')]
4 {'name': 'hi'}
5 [('__dict__', <attribute '__dict__' of 'A' objects>), ('__doc__', None), ('__init__', <function A.__init__ at 0x0000000005BAA598>), ('__module__', '__main__'), ('__weakref__', <attribute '__weakref__' of 'A' objects>)]


# 私有变量和保护变量

In [None]:
__：私有成员 = 私有变量 + 私有方法
_：保护成员 = 保护变量 + 保护方法

# property

In [4]:
class A:
    """Demo: expose a private attribute through a property with getter, setter and deleter."""
    
    def __init__(self, name):
        # Double leading underscore: name-mangled, stored as _A__name.
        self.__name = name
        
    @property  # getter: with only this defined, the attribute would be read-only
    def name(self):
        """Return the stored name."""
        return self.__name
    
    @name.setter  # setter: enables `obj.name = value`
    def name(self, name):
        self.__name = name
    
    @name.deleter  # deleter: enables `del obj.name`
    def name(self):
        del self.__name