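**Корректность проверена на Python 3.7:**
+ pandas 0.23.0
+ numpy 1.14.5
+ sklearn 0.19.1
+ nltk 3.2.4

*Примечание: сохранённые ниже выводы ячеек (пути вида `sklearn.linear_model._logistic`, 5 фолдов по умолчанию в `cross_val_score`) получены на более новой версии scikit-learn (не ниже 0.22), поэтому на указанных выше версиях числа могут немного отличаться.*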

# Анализ тональности отзывов

Сначала возьмем выборку отзывов на фильмы из NLTK:

In [2]:
# Download the movie_reviews corpus package into the local nltk_data directory.
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ximfa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [3]:
from nltk.corpus import movie_reviews

# File ids of the reviews in the 'neg' and 'pos' categories of the corpus.
negids, posids = (movie_reviews.fileids(category) for category in ('neg', 'pos'))

# Peek at the first few ids to see how the files are named.
print(negids[:5])

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt']


Приготовим список текстов и классов как обучающую выборку:

In [4]:
# Each review becomes one string: the corpus tokens of its file joined by spaces.
negfeats = [" ".join(movie_reviews.words(fileids=[fileid])) for fileid in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[fileid])) for fileid in posids]

# Negative reviews come first, so the labels are a run of 0s followed by 1s.
texts = negfeats + posfeats
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

In [5]:
# Show the first review (a negative one, since negatives come first in `texts`).
print(texts[0])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no idea

Импортируем нужные нам модули:

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

### Оценка качества работы разных классификаторов

In [7]:
def text_classifier(vectorizer, transformer, classifier):
    """Assemble a three-step Pipeline for text classification.

    The steps run in the order given and are named "vectorizer",
    "transformer" and "classifier", each holding the matching argument.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [8]:
# Compare three linear models on the same counts -> tf-idf features.
# For each one, print the class and the mean cross-validation score.
for clf in (LogisticRegression, LinearSVC, SGDClassifier):
    print(clf)
    print(
        cross_val_score(
            text_classifier(CountVectorizer(), TfidfTransformer(), clf(max_iter=1000)),
            texts,
            labels,
        ).mean()
    )
    print("\n")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
0.8205


<class 'sklearn.svm._classes.LinearSVC'>
0.8545


<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
0.857




### Подготовка классификатора, обученного на всех данных

In [9]:
# Final sentiment model: tf-idf features fed into a linear SVM,
# fitted on every review with no hold-out split.
clf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LinearSVC()),
])
clf_pipeline.fit(texts, labels)

print(clf_pipeline)

Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('classifier', LinearSVC())])


In [10]:
# Sanity check on two hand-written reviews; label 1 is positive, 0 is negative.
print(clf_pipeline.predict([
    "Amazing film! I will advice it to all my friends. Genious",
    "Awful film! The man who advised me to watch it is really crazy idiot.",
]))

[1 0]


## Понижение размерности и ансамбли деревьев

In [11]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD

# Word counts for every review, followed by a 10-component truncated SVD
# of that matrix.
v = CountVectorizer()
mx = v.fit_transform(texts)
mf = TruncatedSVD(n_components=10)
u = mf.fit_transform(mx)

Wall time: 3.82 s


In [12]:
# Same linear SVM, but trained on only 10 components produced by each
# decomposition of the word counts.
for transform in (TruncatedSVD, NMF):
    print(transform)
    print(
        cross_val_score(
            text_classifier(CountVectorizer(), transform(n_components=10), LinearSVC()),
            texts,
            labels,
        ).mean()
    )
    print("\n")


<class 'sklearn.decomposition._truncated_svd.TruncatedSVD'>




0.5285


<class 'sklearn.decomposition._nmf.NMF'>




0.655







Если задать n_components=1000 (и взять Tf*Idf вместо частот слов):

In [13]:
%%time
# Tf-idf features reduced to 1000 SVD components, then a linear SVM;
# prints the mean cross-validation score.
print(
    cross_val_score(
        text_classifier(
            TfidfVectorizer(),
            TruncatedSVD(n_components=1000),
            LinearSVC(),
        ),
        texts,
        labels,
    ).mean()
)

0.852
Wall time: 4min 35s


## Ансамбли деревьев на преобразованных признаках

In [14]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Interpreter flags such as "-W ignore::DeprecationWarning" cannot be set from
# a comment inside a notebook cell, so install the equivalent filter
# programmatically to actually silence DeprecationWarnings.
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [15]:
%%time
# Random forest of 100 trees on 100 SVD components of the raw word counts;
# prints the score of every fold.
print(cross_val_score(
    text_classifier(
        CountVectorizer(),
        TruncatedSVD(n_components=100),
        RandomForestClassifier(n_estimators=100),
    ),
    texts,
    labels,
))

[0.72   0.7175 0.7025 0.725  0.715 ]
Wall time: 44.2 s


Больше компонент и больше деревьев:

In [None]:
%%time
# Word counts -> 1000 SVD components -> random forest of 1000 trees;
# prints the mean cross-validation score.
print(
    cross_val_score(
        text_classifier(
            CountVectorizer(),
            TruncatedSVD(n_components=1000),
            RandomForestClassifier(n_estimators=1000),
        ),
        texts,
        labels,
    ).mean()
)

Tf*Idf вместо частот слов:

In [None]:
%%time
# Tf-idf -> 1000 SVD components -> random forest of 1000 trees;
# prints the mean cross-validation score.
print(
    cross_val_score(
        text_classifier(
            TfidfVectorizer(),
            TruncatedSVD(n_components=1000),
            RandomForestClassifier(n_estimators=1000),
        ),
        texts,
        labels,
    ).mean()
)

## Совмещаем Tf*Idf и SVD

In [18]:
from sklearn.pipeline import FeatureUnion

# Two transformers applied to the same input: tf-idf weighting and a
# one-component truncated SVD. FeatureUnion joins their outputs.
estimators = [
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=1)),
]
combined = FeatureUnion(estimators)

In [None]:
%%time
# Linear SVM on the combined tf-idf + SVD features built from word counts;
# prints the score of every fold.
print(cross_val_score(
    text_classifier(CountVectorizer(), combined, LinearSVC()),
    texts,
    labels,
))

### HW: классификация SMS-спама

In [20]:
import pandas as pd

In [23]:
# The file is tab-separated and has no header row.
A = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None)

In [25]:
# Name the two columns: the ham/spam label and the message text.
A.columns=['class', 'text']

In [26]:
# Display the frame to check the parsed rows and the column names.
A

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [60]:
# Full text of the message in the row labelled 1.
A.text[1]

'Ok lar... Joking wif u oni...'

In [31]:
# Numeric codes for the two classes: ham -> 0, spam -> 1.
r = dict(ham=0, spam=1)

In [35]:
# Replace the string labels with their numeric codes. Indexing `r` directly
# (rather than passing the dict itself) keeps the KeyError on an unexpected label.
A['class'] = A['class'].map(lambda label: r[label])

In [36]:
# Display the frame again: the class column now holds the 0/1 codes.
A

Unnamed: 0,class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [40]:
# Rebinds `labels` (previously the movie-review labels) to the 0/1 SMS classes.
labels=A['class'].values

In [41]:
# Display the label array.
labels

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [70]:
# SMS texts as an array. Note the spelling `texsts`: it is a separate name
# from the movie-review list `texts`, and the SMS cells use this one.
texsts=A.text.values

In [71]:
# Baseline spam model: word counts fed into logistic regression.
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('reg', LogisticRegression()),
])

In [53]:
from sklearn.model_selection import KFold

In [55]:
# 10 shuffled folds with a fixed seed, shared by the SMS experiments.
cv = KFold(n_splits=10, shuffle=True, random_state=2)

In [73]:
# Mean F1 of the baseline pipeline over the folds in `cv`.
cross_val_score(
    pipe,
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.9284249957857755

In [74]:
# Fit the baseline pipeline on the full SMS data set.
pipe.fit(texsts, labels)

Pipeline(steps=[('vect', CountVectorizer()), ('reg', LogisticRegression())])

In [77]:
# Hand-written probe messages; a prediction of 1 means spam, 0 means ham.
pipe.predict([
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
])

array([1, 1, 0, 0, 0], dtype=int64)

### N-граммы

In [78]:
# Logistic regression on bigram counts only; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(2, 2))),
        ('reg', LogisticRegression()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.815011238543369

In [79]:
# Logistic regression on trigram counts only; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(3, 3))),
        ('reg', LogisticRegression()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.7288421110512258

In [80]:
# Logistic regression on unigrams, bigrams and trigrams together;
# mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 3))),
        ('reg', LogisticRegression()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.9243406573154095

In [81]:
from sklearn.naive_bayes import MultinomialNB

In [86]:
# Word counts fed into multinomial naive Bayes.
pipe1 = Pipeline([
    ('vect', CountVectorizer()),
    ('reg', MultinomialNB()),
])

In [90]:
# Multinomial naive Bayes on bigram counts only; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(2, 2))),
        ('reg', MultinomialNB()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.9310202983728818

In [91]:
# Multinomial naive Bayes on trigram counts only; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(3, 3))),
        ('reg', MultinomialNB()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.870209468089137

In [92]:
# Multinomial naive Bayes on unigrams, bigrams and trigrams together;
# mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 3))),
        ('reg', MultinomialNB()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.9453513399098696

In [93]:
# Multinomial naive Bayes on default word counts; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', CountVectorizer()),
        ('reg', MultinomialNB()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.9469869611310789

In [95]:
# Unlike the Pipeline-based cells, each vectorizer here is fitted on the whole
# corpus before cross-validation, so its vocabulary also covers the held-out
# folds.
# NOTE(review): presumably this is why the scores differ from the Pipeline
# runs with the same n-gram settings -- confirm.
for vectorizer in (
    CountVectorizer(ngram_range=(2, 2)),
    CountVectorizer(ngram_range=(3, 3)),
    CountVectorizer(ngram_range=(1, 3)),
    CountVectorizer(),
):
    X = vectorizer.fit_transform(texsts)
    print(vectorizer)
    print(cross_val_score(MultinomialNB(), X, labels, cv=cv, scoring='f1').mean())

CountVectorizer(ngram_range=(2, 2))
0.640635467655301
CountVectorizer(ngram_range=(3, 3))
0.3769766535692895
CountVectorizer(ngram_range=(1, 3))
0.8878969545119839
CountVectorizer()
0.9301942685556985


In [88]:
# Fit the naive Bayes pipeline on the full SMS data set.
pipe1.fit(texsts, labels)

Pipeline(steps=[('vect', CountVectorizer()), ('reg', MultinomialNB())])

In [89]:
# The same hand-written probe messages, now classified by the naive Bayes
# pipeline; a prediction of 1 means spam, 0 means ham.
pipe1.predict([
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
])

array([1, 1, 0, 0, 1], dtype=int64)

In [94]:
# Logistic regression on tf-idf features; mean F1 over the folds in `cv`.
cross_val_score(
    Pipeline([
        ('vect', TfidfVectorizer()),
        ('reg', LogisticRegression()),
    ]),
    X=texsts,
    y=labels,
    cv=cv,
    scoring='f1',
).mean()

0.8850984614629759