# 文本清洗
## 1. 剔除重复行

In [4]:
import pandas as pd

data = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
df = pd.DataFrame(data)

df.drop_duplicates(inplace=True)

display(df)


Unnamed: 0,0,1,2
0,1,2,3
2,4,5,6


<br>

## 2. 剔除文本两侧的空格符

In [2]:
import pandas as pd

data = [["  天河  ", "海珠", "白云"], ["番禺", "  荔湾", "越秀  "]]
df = pd.DataFrame(data)

for col in df.columns:
    df[col] = df[col].apply(lambda item: item.strip())

display(df)


Unnamed: 0,0,1,2
0,天河,海珠,白云
1,番禺,荔湾,越秀


<br>

## 3. 剔除纯数字

In [1]:
my_list = ["你好", "123", "世界"]

for item in my_list:
    if item.isdigit():
        my_list.remove(item)

print(my_list)


['你好', '世界']


<br>

## 4. 剔除手机号

In [12]:
import re

my_list = ["你好", "13798812258", "我的电话是13798812258", "12312312312312314341", "世界"]

for item in my_list:
    phone_list = re.compile("(?<!\d)(1\d{10})(?!\d)").findall(item)
    if phone_list:
        my_list.remove(item)

print(my_list)


['你好', '我的电话是13798812258', '12312312312312314341', '世界']


<br>

## 5. 剔除网站
`if "http" in item:` 是一个有效的代码，你可以添加各种需要过滤的符号，但注意它不能写成 `or` 形式，必须单成一行

In [9]:
my_list = ["你好", "https://www.baidu.com", "世界"]

for item in my_list:
    if "http" in item:
        my_list.remove(item)

print(my_list)


['你好', '世界']


<br>

## 6. 剔除纯英文

In [10]:
my_list = ["你好", "abadsa", "123abc", "123", "世界"]

for item in my_list:
    if item.encode("UTF-8").isalpha():
        my_list.remove(item)

print(my_list)


['你好', '123abc', '123', '世界']


<br>

## 7. 剔除纯英文或者纯数字

In [11]:
my_list = ["你好", "abadsa", "123abc", "123", "世界"]

for item in my_list:
    if item.encode("UTF-8").isalnum():
        my_list.remove(item)

print(my_list)


['你好', '123abc', '世界']


<br>

## 8. 剔除纯特殊字符

In [24]:
import re

my_list = ["?你好", "13798812258", "我的电话是13798812258?", "？。。", "世界..."]

for item in my_list:
    symbol_list = re.compile("[^0-9A-Za-z\u4e00-\u9fa5]").match(item)
    if symbol_list:
        my_list.remove(item)

print(my_list)


['13798812258', '我的电话是13798812258?', '世界...']
