In [4]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize

corpus = 'With its easily transported, thick-skinned and sweet-tasting fruit, the Gros Michel banana plant dominated the plantations of Central America. United Fruit, the main grower and exporter in South America at the time, mass-produced its bananas in the most efficient way possible: it cloned shoots from the stems of plants instead of growing plants from seeds, and cultivated them in densely packed fields.'
sentences = sent_tokenize(corpus)
print(sentences)

['With its easily transported, thick-skinned and sweet-tasting fruit, the Gros Michel banana plant dominated the plantations of Central America.', 'United Fruit, the main grower and exporter in South America at the time, mass-produced its bananas in the most efficient way possible: it cloned shoots from the stems of plants instead of growing plants from seeds, and cultivated them in densely packed fields.']


In [8]:
from nltk.tokenize import word_tokenize
print(word_tokenize(corpus))

['With', 'its', 'easily', 'transported', ',', 'thick-skinned', 'and', 'sweet-tasting', 'fruit', ',', 'the', 'Gros', 'Michel', 'banana', 'plant', 'dominated', 'the', 'plantations', 'of', 'Central', 'America', '.', 'United', 'Fruit', ',', 'the', 'main', 'grower', 'and', 'exporter', 'in', 'South', 'America', 'at', 'the', 'time', ',', 'mass-produced', 'its', 'bananas', 'in', 'the', 'most', 'efficient', 'way', 'possible', ':', 'it', 'cloned', 'shoots', 'from', 'the', 'stems', 'of', 'plants', 'instead', 'of', 'growing', 'plants', 'from', 'seeds', ',', 'and', 'cultivated', 'them', 'in', 'densely', 'packed', 'fields', '.']


In [10]:
tokens = []
for sentence in sentences:
    t = word_tokenize(sentence)
    tokens.extend(t)
print(tokens)

['With', 'its', 'easily', 'transported', ',', 'thick-skinned', 'and', 'sweet-tasting', 'fruit', ',', 'the', 'Gros', 'Michel', 'banana', 'plant', 'dominated', 'the', 'plantations', 'of', 'Central', 'America', '.', 'United', 'Fruit', ',', 'the', 'main', 'grower', 'and', 'exporter', 'in', 'South', 'America', 'at', 'the', 'time', ',', 'mass-produced', 'its', 'bananas', 'in', 'the', 'most', 'efficient', 'way', 'possible', ':', 'it', 'cloned', 'shoots', 'from', 'the', 'stems', 'of', 'plants', 'instead', 'of', 'growing', 'plants', 'from', 'seeds', ',', 'and', 'cultivated', 'them', 'in', 'densely', 'packed', 'fields', '.']


In [24]:
from nltk.corpus import stopwords

unique_stops = set(stopwords.words('english'))
no_stops = []
for token in tokens:
    token = token.lower()
    if token not in unique_stops and token.isalpha():
        no_stops.append(token)
print(no_stops)

['easily', 'transported', 'fruit', 'gros', 'michel', 'banana', 'plant', 'dominated', 'plantations', 'central', 'america', 'united', 'fruit', 'main', 'grower', 'exporter', 'south', 'america', 'time', 'bananas', 'efficient', 'way', 'possible', 'cloned', 'shoots', 'stems', 'plants', 'instead', 'growing', 'plants', 'seeds', 'cultivated', 'densely', 'packed', 'fields']


In [32]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
lemmatized = []

for token in no_stops:
    token = lemmatizer.lemmatize(token)
    lemmatized.append(token)
print (lemmatized)

['easily', 'transported', 'fruit', 'gros', 'michel', 'banana', 'plant', 'dominated', 'plantation', 'central', 'america', 'united', 'fruit', 'main', 'grower', 'exporter', 'south', 'america', 'time', 'banana', 'efficient', 'way', 'possible', 'cloned', 'shoot', 'stem', 'plant', 'instead', 'growing', 'plant', 'seed', 'cultivated', 'densely', 'packed', 'field']


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vika1\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [38]:
#Bag of Words

from collections import Counter
bow_counter = Counter(lemmatized)
print(bow_counter)

Counter({'plant': 3, 'fruit': 2, 'banana': 2, 'america': 2, 'easily': 1, 'transported': 1, 'gros': 1, 'michel': 1, 'dominated': 1, 'plantation': 1, 'central': 1, 'united': 1, 'main': 1, 'grower': 1, 'exporter': 1, 'south': 1, 'time': 1, 'efficient': 1, 'way': 1, 'possible': 1, 'cloned': 1, 'shoot': 1, 'stem': 1, 'instead': 1, 'growing': 1, 'seed': 1, 'cultivated': 1, 'densely': 1, 'packed': 1, 'field': 1})


In [36]:
print(bow_counter.most_common(10))

[('plant', 3), ('fruit', 2), ('banana', 2), ('america', 2), ('easily', 1), ('transported', 1), ('gros', 1), ('michel', 1), ('dominated', 1), ('plantation', 1)]


In [48]:
#CountVectonizer

from sklearn.feature_extraction.text import CountVectorizer
vectonizer = CountVectorizer(analyzer = 'word',
                            lowercase = True,
                            preprocessor = None,
                            tokenizer = None,
                            stop_words = {'english'},
                            max_features = 5000)
count_matrix = vectonizer.fit_transform(sentences)
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data = count_array, columns = vectonizer.get_feature_names_out())
count_df

Unnamed: 0,america,and,at,banana,bananas,central,cloned,cultivated,densely,dominated,...,sweet,tasting,the,them,thick,time,transported,united,way,with
0,1,1,0,1,0,1,0,0,0,1,...,1,1,2,0,1,0,1,0,0,1
1,1,2,1,0,1,0,1,1,1,0,...,0,0,4,1,0,1,0,1,1,0


In [55]:
#TF-IDF 1 way CountVectonizer + TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
#IDF
tfidf_transformer.fit(count_array)
#TF*IDF
tf_idf_vector = tfidf_transformer.transform(count_array)

df_tfidf = pd.DataFrame(tf_idf_vector.toarray(), columns = vectorizer.get_feature_names_out())
df_tfidf.T

Unnamed: 0,0,1
america,0.165172,0.094517
and,0.165172,0.189033
at,0.0,0.13284
banana,0.232143,0.0
bananas,0.0,0.13284
central,0.232143,0.0
cloned,0.0,0.13284
cultivated,0.0,0.13284
densely,0.0,0.13284
dominated,0.232143,0.0


In [62]:
#TF-IDF 2 way TdidfVectonizer

from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVect = TfidfVectorizer(use_idf = True, stop_words = 'english')
tfidf = tfidfVect.fit_transform(sentences)
df_tfidf = pd.DataFrame(tfidf.toarray(), columns = tfidfVect.get_feature_names_out())
df_tfidf.T

Unnamed: 0,0,1
america,0.197242,0.136898
banana,0.277217,0.0
bananas,0.0,0.192406
central,0.277217,0.0
cloned,0.0,0.192406
cultivated,0.0,0.192406
densely,0.0,0.192406
dominated,0.277217,0.0
easily,0.277217,0.0
efficient,0.0,0.192406


In [66]:
#Среднее знач. TF-IDF для каждого слова по всем текстам
mean_weighs = np.array(tfidf.mean(axis=0)).ravel().tolist()
mean_weighs_df = pd.DataFrame({'term':tfidfVect.get_feature_names_out(),'mean_weigh':mean_weighs})
mean_weighs_df.sort_values(by = 'mean_weigh', ascending = False).reset_index(drop = True).head(10)

Unnamed: 0,term,mean_weigh
0,plants,0.192406
1,america,0.16707
2,fruit,0.16707
3,sweet,0.138608
4,plant,0.138608
5,michel,0.138608
6,banana,0.138608
7,skinned,0.138608
8,gros,0.138608
9,plantations,0.138608


In [83]:
#Косинусной расстояние между текст.векторами
text1 = 'Science is a systematic enterprise that builds and organizes knowledge in the form of testable explanations and predictions about the universe.'
text2 = 'Science texts use lots of unfamiliar terms.'
corpus = [text1, text2]
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words = 'english')
tfIdf = tfIdfVectorizer.fit_transform(corpus)
tfIdf_df = pd.DataFrame(tfIdf.toarray(), index = ['vector1', 'vector2'], columns = tfIdfVectorizer.get_feature_names_out())
tfIdf_df

vector1 = tfIdf.toarray()[0]
vector2 = tfIdf.toarray()[1]
numerator = np.dot(vector1, vector2)

vector1Len = np.linalg.norm(vector1)
vector2Len = np.linalg.norm(vector2)
denomenator = vector1Len*vector2Len

cosine = numerator/denomenator
angle_radians = np.arccos(cosine)
angle_degrees = angle_radians*360/2/np.pi
angle_degrees.round(2)

86.18