# Chardet  
`pip install chardet`

In [1]:
text = '哈'.encode('utf8')
print(text)

b'\xe5\x93\x88'


In [3]:
import chardet

#with open('dataset/redmansion_big5.txt', 'r', encoding='utf-8') as f:
with open('dataset/redmansion_big5.txt', 'rb') as f:
    text = f.read()
    en = chardet.detect(text)
    print(chardet.detect(text))

with open('dataset/redmansion_big5_utf8.txt', 'w', encoding='utf-8') as f:
    text_utf8 = text.decode(en['encoding'], errors='ignore')
    f.write(text_utf8)

{'encoding': 'Big5', 'confidence': 0.99, 'language': 'Chinese'}


# OpenCC 
`pip install opencc-python-reimplemented`  
 ref: https://github.com/yichen0831/opencc-python

In [2]:
from opencc import OpenCC
cc = OpenCC('s2t')  # convert from Simplified Chinese to Traditional Chinese
# can also set conversion by calling set_conversion
# cc.set_conversion('s2tw')
to_convert = '开放中文转换'
converted = cc.convert(to_convert)
print(converted)

開放中文轉換


# Uppercase and Lowercase

In [3]:
text = 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans'
print('before lowercase')
print(text)
print('===================')
print('after lowercase')
#print(text.lower())
print(text.upper())


before lowercase
In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans
after lowercase
IN COMPUTER SCIENCE, ARTIFICIAL INTELLIGENCE (AI), SOMETIMES CALLED MACHINE INTELLIGENCE, IS INTELLIGENCE DEMONSTRATED BY MACHINES, IN CONTRAST TO THE NATURAL INTELLIGENCE DISPLAYED BY HUMANS


In [5]:
print('isaac'.islower())
print('ISAAC'.islower())
print('ISAAC'.isupper())
print('iSAAC'.isupper())

True
False
True
False


# Sentence Tokenization

In [4]:
import nltk
from nltk.tokenize import sent_tokenize 

nltk.download('punkt')
text = 'Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.'
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence+'\n')


Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Word Tokenization

In [7]:
import nltk
from nltk.tokenize import word_tokenize 

nltk.download('punkt')
text = 'Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.'
words = word_tokenize(text)
print(words)

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.', 'Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.', 'It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Stemming

In [8]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

#nltk.download('all')
pst = PorterStemmer()
snowball = SnowballStemmer('english')

print(pst.stem('eating'))
print(pst.stem('shopping'))

print(snowball.stem('eating'))
print(snowball.stem('shopping'))


eat
shop
eat
shop


# Lemmatization 

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


wnl = WordNetLemmatizer()
# second parameter can be n, v, a ......
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('men', 'n'))

# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


car
men
run
eat
sad
fancy


In [10]:
sentence = 'The brown fox is quick and he is jumping over the lazy dog'
import nltk
tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens)
print(tagged_sent)

[('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


In [11]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.'
tokens = word_tokenize(sentence)  
tagged_sent = pos_tag(tokens)     

wnl = WordNetLemmatizer()
lemmas_sent = []
for tag in tagged_sent:
    wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
    lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
print(lemmas_sent)

['football', 'be', 'a', 'family', 'of', 'team', 'sport', 'that', 'involve', ',', 'to', 'vary', 'degree', ',', 'kick', 'a', 'ball', 'to', 'score', 'a', 'goal', '.']


# Stop words

In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# nltk supports 22 languages for removing the stop words
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence)
without_stop_words = [word for word in words if not word in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']
